Wikipedia Extractor http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# =============================================================================
# Version: 2.4 (April 2, 2013)
# Author: Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
#         Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
#
# Contributors:
#   Leonardo Souza (lsouza@amtera.com.br)
#   Juan Manuel Caicedo (juan@cavorite.com)
#   Humberto Pereira (begini@gmail.com)
#   Siegfried-A. Gevatter (siegfried@gevatter.com), 2013
#
# =============================================================================
# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
# This file is part of Tanl.
#
# Tanl is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License, version 3,
# as published by the Free Software Foundation.
#
# Tanl is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""Wikipedia Extractor: | |
Extracts and cleans text from Wikipedia database dump and stores output in a | |
number of files of similar size in a given directory. | |
Each file contains several documents in Tanl document format: | |
<doc id="" url="" title=""> | |
... | |
</doc> | |
Usage: | |
WikiExtractor.py [options] | |
Options: | |
-c, --compress : compress output files using bzip | |
-b, --bytes= n[KM] : put specified bytes per output file (default 500K) | |
-B, --base= URL : base URL for the Wikipedia pages | |
-l, --link : preserve links | |
-n NS, --ns NS : accepted namespaces (separated by commas) | |
-o, --output= dir : place output files in specified directory (default | |
current) | |
-s, --sections : preserve sections | |
-h, --help : display this help and exit | |
""" | |
import sys
import gc
import getopt
import urllib
import re
import bz2
import os.path
from htmlentitydefs import name2codepoint

### PARAMS ####################################################################

# This is obtained from the dump itself
prefix = None

##
# Whether to preserve links in output
#
keepLinks = False

##
# Whether to transform sections into HTML
#
keepSections = False

##
# Recognize only these namespaces
# w: Internal links to the Wikipedia
#
acceptedNamespaces = set(['w'])

##
# Drop these elements from article text
#
discardElements = set([
        'gallery', 'timeline', 'noinclude', 'pre',
        'table', 'tr', 'td', 'th', 'caption',
        'form', 'input', 'select', 'option', 'textarea',
        'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
        'ref', 'references', 'img', 'imagemap', 'source'
        ])

#=========================================================================
#
# MediaWiki Markup Grammar
# Template = "{{" [ "msg:" | "msgnw:" ] PageName { "|" [ ParameterName "=" AnyText | AnyText ] } "}}" ;
# Extension = "<" ? extension ? ">" AnyText "</" ? extension ? ">" ;
# NoWiki = "<nowiki />" | "<nowiki>" ( InlineText | BlockText ) "</nowiki>" ;
# Parameter = "{{{" ParameterName { Parameter } [ "|" { AnyText | Parameter } ] "}}}" ;
# Comment = "<!--" InlineText "-->" | "<!--" BlockText "//-->" ;
#
# ParameterName = ? uppercase, lowercase, numbers, no spaces, some special chars ? ;
#
#===========================================================================
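
# Illustrative example (made-up input) of what clean() below produces:
#   "{{Infobox person|name=Ada}}'''Ada''' was one of the first [[programmer]]s."
#   -> "Ada was one of the first programmers."
# The template is dropped by dropNested, the bold quotes are removed, and the
# wiki link is reduced to its anchor text plus the trailing "s".
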
# Program version
version = '2.4'

##### Main function ###########################################################

def WikiDocument(out, id, title, text):
    url = get_url(id, prefix)
    header = '<doc id="%s" url="%s" title="%s">\n' % (id, url, title)
    # Separate header from text with a newline.
    header += title + '\n'
    header = header.encode('utf-8')
    text = clean(text)
    footer = "\n</doc>"
    out.reserve(len(header) + len(text) + len(footer))
    print >> out, header
    for line in compact(text):
        print >> out, line.encode('utf-8')
    print >> out, footer

def get_url(id, prefix):
    return "%s?curid=%s" % (prefix, id)
#------------------------------------------------------------------------------

selfClosingTags = set([ 'br', 'hr', 'nobr', 'ref', 'references' ])

ignoredTags = set([
        'a', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
        'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
        'p', 'plaintext', 's', 'small', 'span', 'strike', 'strong',
        'sub', 'sup', 'tt', 'u', 'var',
])

placeholder_tags = {'math':'formula', 'code':'codice'}

##
# Normalize title
def normalizeTitle(title):
    # remove leading whitespace and underscores
    title = title.strip(' _')
    # replace sequences of whitespace and underscore chars with a single space
    title = re.compile(r'[\s_]+').sub(' ', title)

    m = re.compile(r'([^:]*):(\s*)(\S(?:.*))').match(title)
    if m:
        prefix = m.group(1)
        if m.group(2):
            optionalWhitespace = ' '
        else:
            optionalWhitespace = ''
        rest = m.group(3)

        ns = prefix.capitalize()
        if ns in acceptedNamespaces:
            # If the prefix designates a known namespace, then it might be
            # followed by optional whitespace that should be removed to get
            # the canonical page name
            # (e.g., "Category: Births" should become "Category:Births").
            title = ns + ":" + rest.capitalize()
        else:
            # No namespace, just capitalize first letter.
            # If the part before the colon is not a known namespace, then we must
            # not remove the space after the colon (if any), e.g.,
            # "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
            # However, to get the canonical page name we must contract multiple
            # spaces into one, because
            # "3001:   The_Final_Odyssey" != "3001: The_Final_Odyssey".
            title = prefix.capitalize() + ":" + optionalWhitespace + rest
    else:
        # no namespace, just capitalize first letter
        title = title.capitalize()
    return title

##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return unichr(int(code[1:], 16))
                else:
                    return unichr(int(code))
            else:               # named entity
                return unichr(name2codepoint[code])
        except:
            return text         # leave as is
    return re.sub("&#?(\w+);", fixup, text)

# Match HTML comments
comment = re.compile(r'<!--.*?-->', re.DOTALL)

# Match elements to ignore
discard_element_patterns = []
for tag in discardElements:
    pattern = re.compile(r'<\s*%s\b[^>]*>.*?<\s*/\s*%s>' % (tag, tag), re.DOTALL | re.IGNORECASE)
    discard_element_patterns.append(pattern)

# Match ignored tags
ignored_tag_patterns = []
for tag in ignoredTags:
    left = re.compile(r'<\s*%s\b[^>]*>' % tag, re.IGNORECASE)
    right = re.compile(r'<\s*/\s*%s>' % tag, re.IGNORECASE)
    ignored_tag_patterns.append((left, right))

# Match selfClosing HTML tags
selfClosing_tag_patterns = []
for tag in selfClosingTags:
    pattern = re.compile(r'<\s*%s\b[^/]*/\s*>' % tag, re.DOTALL | re.IGNORECASE)
    selfClosing_tag_patterns.append(pattern)

# Match HTML placeholder tags
placeholder_tag_patterns = []
for tag, repl in placeholder_tags.items():
    pattern = re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE)
    placeholder_tag_patterns.append((pattern, repl))

# Match preformatted lines
preformatted = re.compile(r'^ .*?$', re.MULTILINE)

# Match external links (space separates second optional parameter)
externalLink = re.compile(r'\[\w+.*? (.*?)\]')
externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]')

# Matches bold/italic
bold_italic = re.compile(r"'''''([^']*?)'''''")
bold = re.compile(r"'''(.*?)'''")
italic_quote = re.compile(r"''\"(.*?)\"''")
italic = re.compile(r"''([^']*)''")
quote_quote = re.compile(r'""(.*?)""')

# Matches space
spaces = re.compile(r' {2,}')

# Matches dots
dots = re.compile(r'\.{4,}')

# A matching function for nested expressions, e.g. namespaces and tables.
def dropNested(text, openDelim, closeDelim):
    openRE = re.compile(openDelim)
    closeRE = re.compile(closeDelim)
    # partition text in separate blocks { } { }
    matches = []                # pairs (s, e) for each partition
    nest = 0                    # nesting level
    start = openRE.search(text, 0)
    if not start:
        return text
    end = closeRE.search(text, start.end())
    next = start
    while end:
        next = openRE.search(text, next.end())
        if not next:            # termination
            while nest:         # close all pending
                nest -= 1
                end0 = closeRE.search(text, end.end())
                if end0:
                    end = end0
                else:
                    break
            matches.append((start.start(), end.end()))
            break
        while end.end() < next.start():
            # { } {
            if nest:
                nest -= 1
                # try closing more
                last = end.end()
                end = closeRE.search(text, end.end())
                if not end:     # unbalanced
                    if matches:
                        span = (matches[0][0], last)
                    else:
                        span = (start.start(), last)
                    matches = [span]
                    break
            else:
                matches.append((start.start(), end.end()))
                # advance start, find next close
                start = next
                end = closeRE.search(text, next.end())
                break           # { }
        if next != start:
            # { { }
            nest += 1
    # collect text outside partitions
    res = ''
    start = 0
    for s, e in matches:
        res += text[start:s]
        start = e
    res += text[start:]
    return res
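
# For instance (illustrative call), nested template markup is removed whole:
#   dropNested('a{{b{{c}}d}}e', r'{{', r'}}') == 'ae'
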
def dropSpans(matches, text):
    """Drop from text the blocks identified in matches"""
    matches.sort()
    res = ''
    start = 0
    for s, e in matches:
        res += text[start:s]
        start = e
    res += text[start:]
    return res

# Match interwiki links; | separates parameters.
# The first parameter is displayed; any trailing concatenated text is also
# included in the display (e.g. 's' for plural).
#
# Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc.
# We first expand inner ones, then remove enclosing ones.
#
wikiLink = re.compile(r'\[\[([^[]*?)(?:\|([^[]*?))?\]\](\w*)')

parametrizedLink = re.compile(r'\[\[.*?\]\]')

# Function applied to wikiLinks
def make_anchor_tag(match):
    global keepLinks
    link = match.group(1)
    colon = link.find(':')
    if colon > 0 and link[:colon] not in acceptedNamespaces:
        return ''
    trail = match.group(3)
    anchor = match.group(2)
    if not anchor:
        anchor = link
    anchor += trail
    if keepLinks:
        return '<a href="%s">%s</a>' % (link, anchor)
    else:
        return anchor
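
# For example (illustrative), with keepLinks == False:
#   "[[public transport]]s"  -> "public transports"
#   "[[File:Foo.jpg|thumb]]" -> ""    (the "File" namespace is not accepted)
# With keepLinks == True the first case becomes
#   '<a href="public transport">public transports</a>'.
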
def clean(text):

    # FIXME: templates should be expanded
    # Drop transclusions (template, parser functions)
    # See: http://www.mediawiki.org/wiki/Help:Templates
    text = dropNested(text, r'{{', r'}}')

    # Drop tables
    text = dropNested(text, r'{\|', r'\|}')

    # Expand links
    text = wikiLink.sub(make_anchor_tag, text)
    # Drop all remaining ones
    text = parametrizedLink.sub('', text)

    # Handle external links
    text = externalLink.sub(r'\1', text)
    text = externalLinkNoAnchor.sub('', text)

    # Handle bold/italic/quote
    text = bold_italic.sub(r'\1', text)
    text = bold.sub(r'\1', text)
    text = italic_quote.sub(r'"\1"', text)
    text = italic.sub(r'"\1"', text)
    text = quote_quote.sub(r'\1', text)
    text = text.replace("'''", '').replace("''", '"')

    ################ Process HTML ###############

    # turn into HTML
    text = unescape(text)
    # do it again (&amp;nbsp;)
    text = unescape(text)

    # Collect spans
    matches = []
    # Drop HTML comments
    for m in comment.finditer(text):
        matches.append((m.start(), m.end()))
    # Drop self-closing tags
    for pattern in selfClosing_tag_patterns:
        for m in pattern.finditer(text):
            matches.append((m.start(), m.end()))
    # Drop ignored tags
    for left, right in ignored_tag_patterns:
        for m in left.finditer(text):
            matches.append((m.start(), m.end()))
        for m in right.finditer(text):
            matches.append((m.start(), m.end()))

    # Bulk remove all spans
    text = dropSpans(matches, text)

    # Cannot use dropSpans on these since they may be nested
    # Drop discarded elements
    for pattern in discard_element_patterns:
        text = pattern.sub('', text)

    # Expand placeholders
    for pattern, placeholder in placeholder_tag_patterns:
        index = 1
        for match in pattern.finditer(text):
            text = text.replace(match.group(), '%s_%d' % (placeholder, index))
            index += 1

    text = text.replace('<<', u'«').replace('>>', u'»')

    #############################################

    # Drop preformatted
    # This can't be done before since it may remove tags
    text = preformatted.sub('', text)

    # Cleanup text
    text = text.replace('\t', ' ')
    text = spaces.sub(' ', text)
    text = dots.sub('...', text)
    # drop the space before closing punctuation and after opening brackets
    text = re.sub(u' ([,:\.\)\]»])', r'\1', text)
    text = re.sub(u'([\[\(«]) ', r'\1', text)
    text = re.sub(r'\n\W+?\n', '\n', text)  # lines with only punctuations
    text = text.replace(',,', ',').replace(',.', '.')
    return text

section = re.compile(r'(==+)\s*(.*?)\s*\1')

def compact(text):
    """Deal with headers, lists, empty sections, residuals of tables"""
    page = []                   # list of paragraph
    headers = {}                # Headers for unfilled sections
    emptySection = False        # empty sections are discarded
    inList = False              # whether opened <UL>

    for line in text.split('\n'):

        if not line:
            continue
        # Handle section titles
        m = section.match(line)
        if m:
            title = m.group(2)
            lev = len(m.group(1))
            if keepSections:
                page.append("<h%d>%s</h%d>" % (lev, title, lev))
            if title and title[-1] not in '!?':
                title += '.'
            headers[lev] = title
            # drop previous headers
            for i in headers.keys():
                if i > lev:
                    del headers[i]
            emptySection = True
            continue
        # Handle page title
        if line.startswith('++'):
            title = line[2:-2]
            if title:
                if title[-1] not in '!?':
                    title += '.'
                page.append(title)
        # handle lists
        elif line[-1] == ':' or line[0] in '*#:;':
            if keepSections:
                page.append("<li>%s</li>" % line[1:])
            else:
                continue
        # Drop residuals of lists
        elif line[0] in '{|' or line[-1] in '}':
            continue
        # Drop irrelevant lines
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            continue
        elif len(headers):
            items = headers.items()
            items.sort()
            for (i, v) in items:
                page.append(v)
            headers.clear()
            page.append(line)   # first line
            emptySection = False
        elif not emptySection:
            page.append(line)

    return page
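
# For example (illustrative), with keepSections enabled the heading line
# "== History ==" is emitted as "<h2>History</h2>"; without it the header is
# buffered as "History." and only emitted once a non-empty section line follows.
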
def handle_unicode(entity):
    numeric_code = int(entity[2:-1])
    if numeric_code >= 0x10000: return ''
    return unichr(numeric_code)

#------------------------------------------------------------------------------

class OutputSplitter:
    def __init__(self, compress, max_file_size, path_name):
        self.dir_index = 0
        self.file_index = -1
        self.compress = compress
        self.max_file_size = max_file_size
        self.path_name = path_name
        self.out_file = self.open_next_file()

    def reserve(self, size):
        cur_file_size = self.out_file.tell()
        if cur_file_size + size > self.max_file_size:
            self.close()
            self.out_file = self.open_next_file()

    def write(self, text):
        self.out_file.write(text)

    def close(self):
        self.out_file.close()

    def open_next_file(self):
        self.file_index += 1
        if self.file_index == 100:
            self.dir_index += 1
            self.file_index = 0
        dir_name = self.dir_name()
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)
        file_name = os.path.join(dir_name, self.file_name())
        if self.compress:
            return bz2.BZ2File(file_name + '.bz2', 'w')
        else:
            return open(file_name, 'w')

    def dir_name(self):
        char1 = self.dir_index % 26
        char2 = self.dir_index / 26 % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def file_name(self):
        return 'wiki_%02d' % self.file_index
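
# The resulting layout (illustrative) is <output_dir>/AA/wiki_00 ... AA/wiki_99,
# then AB/wiki_00 and so on; a new file is opened whenever reserve() sees that
# the size limit given with --bytes would be exceeded.
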
### READER ###################################################################

tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')

def process_data(input, output):
    global prefix

    page = []
    id = None
    inText = False
    redirect = False
    for line in input:
        line = line.decode('utf-8')
        tag = ''
        if '<' in line:
            m = tagRE.search(line)
            if m:
                tag = m.group(2)
        if tag == 'page':
            page = []
            redirect = False
        elif tag == 'id' and not id:
            id = m.group(3)
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'redirect':
            redirect = True
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)] + '\n'
            page.append(line)
            if m.lastindex == 4:  # open-close
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1) + '\n')
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            colon = title.find(':')
            if (colon < 0 or title[:colon] in acceptedNamespaces) and \
                    not redirect:
                print id, title.encode('utf-8')
                sys.stdout.flush()
                WikiDocument(output, id, title, ''.join(page))
            id = None
            page = []
        elif tag == 'base':
            # discover prefix from the xml dump file
            # /mediawiki/siteinfo/base
            base = m.group(3)
            prefix = base[:base.rfind("/")]
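
# The loop above scans the lines of a MediaWiki XML dump, which (simplified)
# looks like:
#   <mediawiki>
#     <siteinfo>...<base>http://en.wikipedia.org/wiki/Main_Page</base>...</siteinfo>
#     <page><title>...</title><id>...</id><revision><text>...</text></revision></page>
#     ...
#   </mediawiki>
# Only the tags handled above are inspected; everything else is skipped.
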
### CL INTERFACE ############################################################

def show_help():
    print >> sys.stdout, __doc__,

def show_usage(script_name):
    print >> sys.stderr, 'Usage: %s [options]' % script_name

##
# Minimum size of output files
minFileSize = 200 * 1024

def main():
    global keepLinks, keepSections, prefix, acceptedNamespaces
    script_name = os.path.basename(sys.argv[0])

    try:
        long_opts = ['help', 'compress', 'bytes=', 'base=', 'links', 'ns=', 'sections', 'output=', 'version']
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'cb:hln:o:B:sv', long_opts)
    except getopt.GetoptError:
        show_usage(script_name)
        sys.exit(1)

    compress = False
    file_size = 500 * 1024
    output_dir = '.'

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            show_help()
            sys.exit()
        elif opt in ('-c', '--compress'):
            compress = True
        elif opt in ('-l', '--links'):
            keepLinks = True
        elif opt in ('-s', '--sections'):
            keepSections = True
        elif opt in ('-B', '--base'):
            prefix = arg
        elif opt in ('-b', '--bytes'):
            try:
                if arg[-1] in 'kK':
                    file_size = int(arg[:-1]) * 1024
                elif arg[-1] in 'mM':
                    file_size = int(arg[:-1]) * 1024 * 1024
                else:
                    file_size = int(arg)
                if file_size < minFileSize: raise ValueError()
            except ValueError:
                print >> sys.stderr, \
                    '%s: %s: Insufficient or invalid size' % (script_name, arg)
                sys.exit(2)
        elif opt in ('-n', '--ns'):
            acceptedNamespaces = set(arg.split(','))
        elif opt in ('-o', '--output'):
            output_dir = arg
        elif opt in ('-v', '--version'):
            print 'WikiExtractor.py version:', version
            sys.exit(0)

    if len(args) > 0:
        show_usage(script_name)
        sys.exit(4)

    if not os.path.isdir(output_dir):
        try:
            os.makedirs(output_dir)
        except:
            print >> sys.stderr, 'Could not create: ', output_dir
            return

    output_splitter = OutputSplitter(compress, file_size, output_dir)
    process_data(sys.stdin, output_splitter)
    output_splitter.close()

if __name__ == '__main__':
    main()

#!/usr/bin/python
# -*- coding: utf-8 -*-
# =============================================================================
# Multithread-Wikipedia-Extractor
# For SMP based architectures
# Version: 1.0 (October 15, 2012)
# =============================================================================
# Copyright (c) 2012. Leonardo Souza (leonardossz@gmail.com).
# =============================================================================
# =============================================================================
# This is a modified version of the original Wikipedia Extractor by
# Giuseppe Attardi (attardi@di.unipi.it), University of Pisa and
# Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa; the
# original work can be found at http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
# =============================================================================
# Copyright (c) 2009. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
#
# multithread-wikipedia-extractor is free software;
# you can redistribute it and/or modify it under the
# terms of the GNU General Public License, version 3,
# as published by the Free Software Foundation.
#
# multithread-wikipedia-extractor is distributed in the hope
# that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =============================================================================
"""
Multithread Wikipedia Extractor:
Extracts and cleans text from Wikipedia database dump and stores output in a
number of files of similar size in a given directory.
Each file contains several documents in the format:
    <doc id="" url="" title="">
    ...
    </doc>
"""

import Queue, threading, argparse, shutil, json
import sys, re, bz2, multiprocessing
import os.path, os, string, random, traceback
from htmlentitydefs import name2codepoint
from lxml import etree

# compatible with the original work from the TANL project
# see http://medialab.di.unipi.it/wiki/Tanl for more info
TANL = "tanl"

# outputs json objects
JSON = "json"
class WikiCleanerThread(threading.Thread):

    _filename_lock = threading.RLock()

    def __init__(self, queue, outputdir, maxfilesize, prefix, compress, output_format):
        threading.Thread.__init__(self)
        self._queue = queue
        self._maxfilesize = maxfilesize
        self._prefix = prefix
        self._compress = compress
        self._outputdir = outputdir
        self._output_format = output_format

        if not os.path.exists(outputdir):
            os.mkdir(outputdir)

        self._outfile = None

    @classmethod
    def _get_file(cls, outputdir, compress=False):
        with cls._filename_lock:
            fpath = None
            while not fpath or os.path.exists(fpath):
                fname = ''.join([random.choice(string.letters) for _ in range(16)])
                ext = ".raw" if not compress else ".raw.bz2"
                fpath = os.path.join(outputdir, fname + ext)
            if compress:
                return bz2.BZ2File(fpath, 'w')
            return open(fpath, 'w')

    def _geturl(self, wiki_id):
        return "%s?curid=%s" % (self._prefix, wiki_id)

    def _write(self, wiki_id, wiki_title, wiki_text):
        if not self._outfile:
            self._outfile = self._get_file(self._outputdir, self._compress)

        print "[%s] [%s]" % (wiki_id.encode('utf-8'), wiki_title.encode('utf-8'))
        url = self._geturl(wiki_id)

        if self._output_format == TANL:
            header = '<doc id="%s" url="%s" title="%s">%s\n' % (wiki_id, url, wiki_title, wiki_title)
            body = ' '.join(compact(clean(wiki_text))).strip()
            footer = "\n</doc>"
            self._outfile.write(header.encode("utf-8"))
            self._outfile.write(body.encode("utf-8"))
            self._outfile.write(footer.encode("utf-8"))
        elif self._output_format == JSON:
            article = dict(id=wiki_id, url=url, title=wiki_title, text=wiki_text)
            self._outfile.write(json.dumps(article, encoding='utf-8') + '\n')

        if self._outfile.tell() > self._maxfilesize:
            self._outfile.close()
            self._outfile = None

    def _clean(self, page_elem):
        # wiki XML dumps carry a namespace;
        # reuse the xmlns taken from the page element itself
        def TAG(tag):
            return page_elem.tag.split("page")[0] + tag

        wiki_id = page_elem.find(TAG("id")).text.strip()
        wiki_title = page_elem.find(TAG("title")).text.strip()
        revision_elem = page_elem.find(TAG("revision"))
        if revision_elem is not None:
            text_elem = revision_elem.find(TAG("text"))
            if text_elem is not None:
                wiki_text = text_elem.text.strip()
                self._write(wiki_id, wiki_title, wiki_text)

    def run(self):
        while True:
            page_elem = None
            try:
                page_elem = self._queue.get(timeout=1)
                if page_elem is not None:
                    self._clean(page_elem)
            except Queue.Empty:
                break
            except:
                traceback.print_exc(file=sys.stdout)
            finally:
                if page_elem is not None:
                    page_elem.clear()
                    self._queue.task_done()

        print "%s done" % self.name
##
# Whether to preserve links in output
#
keepLinks = False

##
# Whether to transform sections into HTML
#
keepSections = False

##
# Recognize only these namespaces
#
acceptedNamespaces = set([
])

##
# Drop these elements from article text
#
discardElements = set([
        'gallery', 'timeline', 'noinclude', 'pre',
        'table', 'tr', 'td', 'th', 'caption',
        'form', 'input', 'select', 'option', 'textarea',
        'ul', 'li', 'ol', 'dl', 'dt', 'dd', 'menu', 'dir',
        'ref', 'references', 'img', 'imagemap', 'source'
        ])

#=========================================================================
#
# MediaWiki Markup Grammar
# Template = "{{" [ "msg:" | "msgnw:" ] PageName { "|" [ ParameterName "=" AnyText | AnyText ] } "}}" ;
# Extension = "<" ? extension ? ">" AnyText "</" ? extension ? ">" ;
# NoWiki = "<nowiki />" | "<nowiki>" ( InlineText | BlockText ) "</nowiki>" ;
# Parameter = "{{{" ParameterName { Parameter } [ "|" { AnyText | Parameter } ] "}}}" ;
# Comment = "<!--" InlineText "-->" | "<!--" BlockText "//-->" ;
#
# ParameterName = ? uppercase, lowercase, numbers, no spaces, some special chars ? ;
#
#===========================================================================
selfClosingTags = set([ 'br', 'hr', 'nobr', 'ref', 'references' ])

ignoredTags = set([
        'a', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
        'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
        'p', 'plaintext', 's', 'small', 'span', 'strike', 'strong',
        'sub', 'sup', 'tt', 'u', 'var',
])

placeholder_tags = {'math':'formula', 'code':'codice'}

##
# Normalize title
def normalizeTitle(title):
    # remove leading whitespace and underscores
    title = title.strip(' _')
    # replace sequences of whitespace and underscore chars with a single space
    title = re.compile(r'[\s_]+').sub(' ', title)

    m = re.compile(r'([^:]*):(\s*)(\S(?:.*))').match(title)
    if m:
        prefix = m.group(1)
        if m.group(2):
            optionalWhitespace = ' '
        else:
            optionalWhitespace = ''
        rest = m.group(3)

        ns = prefix.capitalize()
        if ns in acceptedNamespaces:
            # If the prefix designates a known namespace, then it might be
            # followed by optional whitespace that should be removed to get
            # the canonical page name
            # (e.g., "Category: Births" should become "Category:Births").
            title = ns + ":" + rest.capitalize()
        else:
            # No namespace, just capitalize first letter.
            # If the part before the colon is not a known namespace, then we must
            # not remove the space after the colon (if any), e.g.,
            # "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
            # However, to get the canonical page name we must contract multiple
            # spaces into one, because
            # "3001:   The_Final_Odyssey" != "3001: The_Final_Odyssey".
            title = prefix.capitalize() + ":" + optionalWhitespace + rest
    else:
        # no namespace, just capitalize first letter
        title = title.capitalize()
    return title

##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return unichr(int(code[1:], 16))
                else:
                    return unichr(int(code))
            else:               # named entity
                return unichr(name2codepoint[code])
        except:
            return text         # leave as is
    return re.sub("&#?(\w+);", fixup, text)

# Match HTML comments
comment = re.compile(r'<!--.*?-->', re.DOTALL)

# Match elements to ignore
discard_element_patterns = []
for tag in discardElements:
    pattern = re.compile(r'<%s[^>]*>.*?</%s>' % (tag, tag), re.DOTALL | re.IGNORECASE)
    discard_element_patterns.append(pattern)

# Match ignored tags
ignored_tag_patterns = []
for tag in ignoredTags:
    left = re.compile(r'<%s[^/]*>' % tag, re.IGNORECASE)
    right = re.compile(r'</%s>' % tag, re.IGNORECASE)
    ignored_tag_patterns.append((left, right))

# Match selfClosing HTML tags
selfClosing_tag_patterns = []
for tag in selfClosingTags:
    pattern = re.compile(r'<%s[^/]*/\s*>' % tag, re.DOTALL | re.IGNORECASE)
    selfClosing_tag_patterns.append(pattern)

# Match HTML placeholder tags
placeholder_tag_patterns = []
for tag, repl in placeholder_tags.items():
    pattern = re.compile(r'<\s*%s(\s*| [^/]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE)
    placeholder_tag_patterns.append((pattern, repl))

# Match preformatted lines
preformatted = re.compile(r'^ .*?$', re.MULTILINE)

# Match external links (space separates second optional parameter)
externalLink = re.compile(r'\[\w+.*? (.*?)\]')
externalLinkNoAnchor = re.compile(r'\[\w+[&\]]*\]')

# Matches bold/italic
bold_italic = re.compile(r"'''''([^']*?)'''''")
bold = re.compile(r"'''(.*?)'''")
italic_quote = re.compile(r"''\"(.*?)\"''")
italic = re.compile(r"''([^']*)''")
quote_quote = re.compile(r'""(.*?)""')

# Matches space
spaces = re.compile(r' {2,}')

# Matches dots
dots = re.compile(r'\.{4,}')

# A matching function for nested expressions, e.g. namespaces and tables.
def dropNested(text, openDelim, closeDelim):
    openRE = re.compile(openDelim)
    closeRE = re.compile(closeDelim)
    # partition text in separate blocks { } { }
    matches = []                # pairs (s, e) for each partition
    nest = 0                    # nesting level
    start = openRE.search(text, 0)
    if not start:
        return text
    end = closeRE.search(text, start.end())
    next = start
    while end:
        next = openRE.search(text, next.end())
        if not next:            # termination
            while nest:         # close all pending
                nest -= 1
                end0 = closeRE.search(text, end.end())
                if end0:
                    end = end0
                else:
                    break
            matches.append((start.start(), end.end()))
            break
        while end.end() < next.start():
            # { } {
            if nest:
                nest -= 1
                # try closing more
                last = end.end()
                end = closeRE.search(text, end.end())
                if not end:     # unbalanced
                    if matches:
                        span = (matches[0][0], last)
                    else:
                        span = (start.start(), last)
                    matches = [span]
                    break
            else:
                matches.append((start.start(), end.end()))
                # advance start, find next close
                start = next
                end = closeRE.search(text, next.end())
                break           # { }
        if next != start:
            # { { }
            nest += 1
    # collect text outside partitions
    res = ''
    start = 0
    for s, e in matches:
        res += text[start:s]
        start = e
    res += text[start:]
    return res
def dropSpans(matches, text):
    """Drop from text the blocks identified in matches"""
    matches.sort()
    res = ''
    start = 0
    for s, e in matches:
        res += text[start:s]
        start = e
    res += text[start:]
    return res

# Match interwiki links; | separates parameters.
# The first parameter is displayed; any trailing concatenated text is also
# included in the display (e.g. 's' for plural).
#
# Can be nested [[File:..|..[[..]]..|..]], [[Category:...]], etc.
# We first expand inner ones, then remove enclosing ones.
#
wikiLink = re.compile(r'\[\[([^[]*?)(?:\|([^[]*?))?\]\](\w*)')

parametrizedLink = re.compile(r'\[\[.*?\]\]')

# Function applied to wikiLinks
def make_anchor_tag(match):
    global keepLinks
    link = match.group(1)
    colon = link.find(':')
    if colon > 0 and link[:colon] not in acceptedNamespaces:
        return ''
    trail = match.group(3)
    anchor = match.group(2)
    if not anchor:
        anchor = link
    anchor += trail
    if keepLinks:
        return '<a href="%s">%s</a>' % (link, anchor)
    else:
        return anchor
def clean(text):

    # FIXME: templates should be expanded
    # Drop transclusions (template, parser functions)
    # See: http://www.mediawiki.org/wiki/Help:Templates
    text = dropNested(text, r'{{', r'}}')

    # Drop tables
    text = dropNested(text, r'{\|', r'\|}')

    # Drop preformatted
    text = preformatted.sub('', text)

    # Expand links
    text = wikiLink.sub(make_anchor_tag, text)
    # Drop all remaining ones
    text = parametrizedLink.sub('', text)

    # Handle external links
    text = externalLink.sub(r'\1', text)
    text = externalLinkNoAnchor.sub('', text)

    # Handle bold/italic/quote
    text = bold_italic.sub(r'\1', text)
    text = bold.sub(r'\1', text)
    text = italic_quote.sub(r'"\1"', text)
    text = italic.sub(r'"\1"', text)
    text = quote_quote.sub(r'\1', text)
    text = text.replace("'''", '').replace("''", '"')

    ################ Process HTML ###############

    # turn into HTML
    text = unescape(text)
    # do it again (&amp;nbsp;)
    text = unescape(text)

    # Collect spans
    matches = []
    # Drop HTML comments
    for m in comment.finditer(text):
        matches.append((m.start(), m.end()))
    # Drop self-closing tags
    for pattern in selfClosing_tag_patterns:
        for m in pattern.finditer(text):
            matches.append((m.start(), m.end()))
    # Drop ignored tags
    for left, right in ignored_tag_patterns:
        for m in left.finditer(text):
            matches.append((m.start(), m.end()))
        for m in right.finditer(text):
            matches.append((m.start(), m.end()))

    # Bulk remove all spans
    text = dropSpans(matches, text)

    # Cannot use dropSpans on these since they may be nested
    # Drop discarded elements
    for pattern in discard_element_patterns:
        text = pattern.sub('', text)

    # Expand placeholders
    for pattern, placeholder in placeholder_tag_patterns:
        index = 1
        for match in pattern.finditer(text):
            text = text.replace(match.group(), '%s_%d' % (placeholder, index))
            index += 1

    text = text.replace('<<', u'«').replace('>>', u'»')

    # Cleanup text
    text = text.replace('\t', ' ')
    text = spaces.sub(' ', text)
    text = dots.sub('...', text)
    # drop the space before closing punctuation and after opening brackets
    text = re.sub(u' ([,:\.\)\]»])', r'\1', text)
    text = re.sub(u'([\[\(«]) ', r'\1', text)
    text = re.sub(r'\n\W+?\n', '\n', text)  # lines with only punctuations
    text = text.replace(',,', ',').replace(',.', '.')
    return text

section = re.compile(r'(==+)\s*(.*?)\s*\1')

def compact(text):
    """Deal with headers, lists, empty sections, residuals of tables"""
    page = []                   # list of paragraph
    headers = {}                # Headers for unfilled sections
    emptySection = False        # empty sections are discarded

    for line in text.split('\n'):

        if not line:
            continue
        # Handle section titles
        m = section.match(line)
        if m:
            title = m.group(2)
            lev = len(m.group(1))
            if keepSections:
                page.append("<h%d>%s</h%d>" % (lev, title, lev))
            if title and title[-1] not in '!?':
                title += '.'
            headers[lev] = title
            # drop previous headers
            for i in headers.keys():
                if i > lev:
                    del headers[i]
            emptySection = True
            continue
        # Handle page title
        if line.startswith('++'):
            title = line[2:-2]
            if title:
                if title[-1] not in '!?':
                    title += '.'
                page.append(title)
        # handle lists
        elif line[-1] == ':' or line[0] in '*#:;':
            if keepSections:
                page.append("<li>%s</li>" % line[1:])
            else:
                continue
        # Drop residuals of lists
        elif line[0] in '{|' or line[-1] in '}':
            continue
        # Drop irrelevant lines
        elif (line[0] == '(' and line[-1] == ')') or line.strip('.-') == '':
            continue
        elif len(headers):
            items = headers.items()
            items.sort()
            for (i, v) in items:
                page.append(v)
            headers.clear()
            page.append(line)   # first line
            emptySection = False
        elif not emptySection:
            page.append(line)

    return page

def handle_unicode(entity):
    numeric_code = int(entity[2:-1])
    if numeric_code >= 0x10000: return ''
    return unichr(numeric_code)
def process_data(inputdump, outputdir, maxfilesize, compress, outformat):
    # we expect large dumps, so we use lxml's iterparse
    context = etree.iterparse(inputdump)
    context = iter(context)

    # discover prefix from the xml dump file
    # /mediawiki/siteinfo/base
    prefix = None
    for event, elem in context:
        if event == "end" and elem.tag.endswith("base"):
            prefix = elem.text[:elem.text.rfind("/")]
            break

    print "base url: %s" % prefix

    # initialize wiki page queue
    queue = Queue.Queue(maxsize=100)

    # start worker threads
    workers = []
    for _ in range(multiprocessing.cpu_count()):
        cleaner = WikiCleanerThread(queue, outputdir, maxfilesize, prefix, compress, outformat)
        cleaner.setDaemon(True)
        cleaner.start()
        workers.append(cleaner)

    # put page elements in the queue to be processed by the cleaner threads
    for event, elem in context:
        if event == "end" and elem.tag.endswith("page"):
            queue.put(elem)

    # wait until the queue is drained and all workers have finished
    queue.join()
    for w in workers:
        w.join()

    print "finished"
def main():
    global keepLinks, keepSections

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
    parser.add_argument("wikidump", help="XML wiki dump file")
    parser.add_argument("outputdir", help="output directory")
    parser.add_argument("-w", "--overwrite", default=False, action="store_const", const=True, help="Overwrite existing output dir")
    parser.add_argument("-b", "--bytes", default="25M", help="put specified bytes per output file (default is %(default)s)", metavar="n[KM]")
    parser.add_argument("-c", "--compress", default=False, action="store_const", const=True, help="compress output files using bzip")
    parser.add_argument("-l", "--links", default=False, action="store_const", const=True, help="preserve links")
    parser.add_argument("-s", "--sections", default=False, action="store_const", const=True, help="preserve sections")
    parser.add_argument("-f", "--format", choices=(TANL, JSON), default=JSON, help="choose output format (default is %(default)s)")
    args = parser.parse_args()

    keepLinks = args.links
    keepSections = args.sections

    # Minimum size of output files
    min_file_size = 200 * 1024

    try:
        if args.bytes[-1] in 'kK':
            file_size = int(args.bytes[:-1]) * 1024
        elif args.bytes[-1] in 'mM':
            file_size = int(args.bytes[:-1]) * 1024 * 1024
        else:
            file_size = int(args.bytes)
        if file_size < min_file_size: raise ValueError()
    except ValueError:
        print >> sys.stderr, \
            'Insufficient or invalid bytes size (minimum per output is %d bytes)' \
            % min_file_size
        return

    if not os.path.exists(args.outputdir):
        os.makedirs(args.outputdir)
    else:
        if args.overwrite:
            shutil.rmtree(args.outputdir)
            os.makedirs(args.outputdir)
        else:
            raise ValueError("%s already exists, use --overwrite to recreate" % args.outputdir)

    if args.wikidump.lower().endswith("bz2"):
        with bz2.BZ2File(args.wikidump, 'r') as inputdump:
            process_data(inputdump, args.outputdir, file_size, args.compress, args.format.lower())
    else:
        with open(args.wikidump, 'r') as inputdump:
            process_data(inputdump, args.outputdir, file_size, args.compress, args.format.lower())

if __name__ == '__main__':
    main()
See https://bitbucket.org/leonardossz/multithreaded-wikipedia-extractor/wiki/Home for the multithreaded fork and http://medialab.di.unipi.it/wiki/Wikipedia_Extractor for the original extractor.
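A minimal invocation sketch for the multithreaded version, assuming the second file above is saved as MultithreadWikipediaExtractor.py (the file name is hypothetical):
    python MultithreadWikipediaExtractor.py enwiki-latest-pages-articles.xml.bz2 extracted -b 25M -f json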