Skip to content

Instantly share code, notes, and snippets.

Created March 23, 2012 08:23
Show Gist options
  • Save Fil/2168253 to your computer and use it in GitHub Desktop.
Save Fil/2168253 to your computer and use it in GitHub Desktop.
import seenthis into gensim
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# adapted from gensim-0.8.4-py2.7.egg/gensim/corpora/wikicorpus.py
# Copyright (C) 2010 Radim Rehurek <>
# Licensed under the GNU LGPL v2.1 -
"""
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a bz2-compressed \
dump of Wikipedia articles, in XML format.

This actually creates three files:

* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in Matrix Market format
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation

The output Matrix Market files can then be compressed (e.g., by bzip2) to save \
disk space; gensim's corpus iterators can work with compressed input, too.

`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
removing tokens that appear in more than 10%% of all documents). Defaults to 50,000.

If you have the `pattern` package installed, this script will use a fancy lemmatization
to get a lemma of each token (instead of plain alphabetic tokenizer).

Example: %(program)s ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
"""
import logging
import itertools
import sys
import os.path
import re
import bz2
import gzip
from gensim import interfaces, matutils, utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora.mmcorpus import MmCorpus
logger = logging.getLogger('gensim.corpora.wikicorpus')
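# Wiki is first scanned for all distinct word types (~7M). The types that appear
# in more than 10% of articles are removed and, from the rest, the DEFAULT_DICT_SIZE
# most frequent types are kept (default 100K).

# Ignore articles shorter than ARTICLE_MIN_CHARS characters (after preprocessing).

# If the 'pattern' package is installed, we can use a fancy shallow parsing to get
# token lemmas. Otherwise, use simple regexp tokenization.

# NOTE(review): the assignments of DEFAULT_DICT_SIZE, ARTICLE_MIN_CHARS and LEMMATIZE
# that these comments describe are not visible here, although the code below uses
# all three names -- confirm they are defined before this module is run.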
# Pre-compiled patterns used by remove_markup() to strip wiki markup.
# They are written as raw strings so that regex escapes such as \[ , \w and \S
# reach the regex engine untouched instead of relying on Python passing
# unknown string escapes through.
RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)  # comments
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)  # footnotes
RE_P2 = re.compile(r"(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE)  # links to languages
RE_P3 = re.compile(r"{{([^}{]*)}}", re.DOTALL | re.UNICODE)  # template containing no braces (inner-most)
RE_P4 = re.compile(r"{{([^}]*)}}", re.DOTALL | re.UNICODE)  # template up to the first closing brace
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE)  # remove URL, keep description
RE_P6 = re.compile(r"\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE)  # simplify links, keep description
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of images
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE)  # keep description of files
RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE)  # <nowiki> sections ("outside links")
RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE)  # math content
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE)  # all other tags
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
RE_P15 = re.compile(r'https?\S+\s', re.UNICODE)  # URLs
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text.

    `raw` is either unicode or a utf-8 encoded string. HTML entities are
    decoded first; the result is then passed through `remove_markup()` and
    returned.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    # entities are decoded twice so that doubly-escaped ones are resolved too:
    # '&amp;nbsp;' --> '&nbsp;' --> non-breaking space
    text = utils.decode_htmlentities(text)
    text = utils.decode_htmlentities(text)
    return remove_markup(text)
def remove_markup(text):
    """
    Strip wiki markup from `text` and return what is left as plain text.

    The wiki markup is recursive (markup inside markup etc). Instead of
    writing a recursive grammar, the substitutions are applied in a loop,
    starting with inner-most expressions and working outwards, until a pass
    changes nothing or three passes have been made. Square brackets that
    survive the loop are dropped.
    """
    text = RE_P2.sub("", text)  # remove the last list (=languages)

    for _ in range(3):  # hard cap on the number of passes
        old = text
        text = RE_P0.sub("", text)  # remove comments
        text = RE_P1.sub("", text)  # remove footnotes
        text = RE_P9.sub("", text)  # remove <nowiki> sections ("outside links")
        text = RE_P10.sub("", text)  # remove math content
        text = RE_P11.sub("", text)  # remove all remaining tags
        # remove templates (no recursion)
        text = RE_P3.sub("", text)
        text = RE_P4.sub("", text)
        text = RE_P14.sub("", text)  # remove categories
        text = RE_P15.sub("", text)  # remove URLs
        text = RE_P5.sub('\\3', text)  # remove urls, keep description
        text = RE_P7.sub('\n\\3', text)  # simplify images, keep description only
        text = RE_P8.sub('\n\\3', text)  # simplify files, keep description only
        text = RE_P6.sub('\\2', text)  # simplify links, keep description only
        # remove table markup
        text = text.replace('||', '\n|')  # each table cell on a separate line
        text = RE_P12.sub('\n', text)  # remove formatting lines
        text = RE_P13.sub('\n\\3', text)  # leave only cell content
        # remove empty mark-up
        text = text.replace('[]', '')
        if old == text:  # nothing changed during this pass: we are done
            break

    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
    # TODO is this really desirable?
    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text
    return text
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
    than 15 characters (not bytes!), as well as tokens starting with an underscore.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')]
class WikiCorpus(TextCorpus):
    """
    Treat a wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.

    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 8h, creates a file in MatrixMarket format plus file with id->word

    NOTE(review): `get_texts()` below opens the file with `gzip.GzipFile` and
    extracts `<thread>` elements, so the bz2/wikipedia wording above looks
    inherited from the code this was adapted from -- confirm the expected input.
    """
    def __init__(self, fname, no_below=5, keep_words=DEFAULT_DICT_SIZE, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary (only the first `keep_words` most frequent words that
        appear in at least `no_below` documents are kept).

        If a `dictionary` is supplied it is used as-is and the scan is skipped.
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
            self.dictionary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
        else:
            self.dictionary = dictionary

    def get_texts(self, return_raw=False):
        """
        Iterate over the dump, returning text version of each article.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        With `return_raw=True` the filtered text of each article is yielded;
        otherwise a list of tokens is yielded per article (lemmatized when
        `LEMMATIZE` is set, plainly tokenized if not).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print vec
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        lines = []  # defined up front so a stray closing tag cannot hit an unbound name
        if LEMMATIZE:
            lemmatizer = utils.lemmatizer
            yielded = 0

        # NOTE(review): lines are compared against str literals, which assumes
        # Python 2 byte strings coming out of GzipFile.
        with gzip.GzipFile(self.fname) as fin:  # closed even if the consumer stops early
            for line in fin:
                # NOTE(review): the leading whitespace of this marker must match the
                # dump's indentation exactly -- confirm against a real input file.
                if line.startswith(' <thread'):
                    intext = True
                    line = line[line.find('>') + 1:]
                    lines = [line]
                elif intext:
                    lines.append(line)
                pos = line.find('</thread>')  # can be on the same line as <thread>
                if pos < 0:
                    continue
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki(''.join(lines))
                if len(text) <= ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    continue
                articles += 1
                if return_raw:
                    yield text
                elif LEMMATIZE:
                    _ = lemmatizer.feed(text)
                    while lemmatizer.has_results():
                        # NOTE(review): `.read()` is reconstructed; the call was missing
                        # from the source -- confirm against utils.lemmatizer's API.
                        _, result = lemmatizer.read()  # not necessarily the same text as entered above!
                        positions += len(result)
                        yielded += 1
                        yield result
                else:
                    result = tokenize(text)  # text into tokens here
                    positions += len(result)
                    yield result

        # in raw mode nothing was fed to the lemmatizer, so there is nothing to wait for
        if LEMMATIZE and not return_raw:
            logger.info("all %i articles read; waiting for lemmatizer to finish the %i remaining jobs",
                        articles, articles - yielded)
            while yielded < articles:
                _, result = lemmatizer.read()
                positions += len(result)
                yielded += 1
                yield result

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)",
                    articles, positions, articles_all)
        self.length = articles  # cache corpus length
#endclass WikiCorpus
class VocabTransform(interfaces.TransformationABC):
    """
    Remap feature ids to new values.

    Given a mapping between old ids and new ids (some old ids may be missing = these
    features are to be discarded), this will wrap a corpus so that iterating over
    `VocabTransform[corpus]` returns the same vectors but with the new ids.

    Old features that have no counterpart in the new ids are discarded. This
    can be used to filter vocabulary of a corpus "online"::

    >>> old2new = dict((oldid, newid) for newid, oldid in enumerate(ids_you_want_to_keep))
    >>> vt = VocabTransform(old2new)
    >>> for vec_with_new_ids in vt[corpus_with_old_ids]:
    >>>     ...

    """
    def __init__(self, old2new, id2token=None):
        """
        Store the `old2new` id mapping and the optional `id2token` mapping.
        """
        # id2word = dict((newid, oldid2word[oldid]) for oldid, newid in old2new.iteritems())
        self.old2new = old2new
        self.id2token = id2token

    def __getitem__(self, bow):
        """
        Return representation with the ids transformed.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # single vector: keep only the ids present in the mapping, renumbered
        old2new = self.old2new
        return [(old2new[oldid], weight) for oldid, weight in bow if oldid in old2new]
#endclass VocabTransform
if __name__ == '__main__':
    # Command-line driver: INPUT OUTPUT_PREFIX [VOCABULARY_SIZE].
    # Builds the dictionary, then writes the word-id mapping, the bag-of-words
    # corpus and the TF-IDF corpus, each to its own file under OUTPUT_PREFIX.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])  # picked up by the usage text via locals()

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)  # without this the unpacking below fails on the missing arguments
    inp, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
    wiki = WikiCorpus(inp, keep_words=keep_words)

    # save dictionary and bag-of-words (term-document frequency matrix)
    # another ~9h
    # NOTE(review): the '_bow.mm' / '_tfidf.mm' suffixes are reconstructed; the source
    # had empty suffixes, which made the TF-IDF step overwrite the file it reads.
    wiki.dictionary.save_as_text(output + '_wordids.txt')
    MmCorpus.serialize(output + '_bow.mm', wiki, progress_cnt=10000)
    del wiki

    # initialize corpus reader and word->id mapping
    id2token = Dictionary.load_from_text(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')

    # build tfidf,
    # ~30min
    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

    # save tfidf vectors in matrix market format
    # ~2h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment