Last active
January 4, 2016 18:29
-
-
Save buhii/8660433 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Extract plain text from a Japanese Wikipedia XML dump (pages-articles.xml.bz2).
import codecs
import HTMLParser  # Python 2 module; renamed html.parser in Python 3
import sys
import re
import bz2
# When True, normalize() appends the original raw line after the cleaned one
# so before/after can be compared.
DEBUG = False
# HTML entity decoder (e.g. &amp; -> &), bound once at import time.
unescape = HTMLParser.HTMLParser().unescape
def multi_strip(candidates):
    """Build a function that blanks out markup characters.

    ``candidates`` is iterated element by element (so a plain string is
    treated as a set of single characters); the returned callable replaces
    every occurrence of each element with a single space.
    """
    def strip_all(text):
        cleaned = text
        for needle in candidates:
            cleaned = cleaned.replace(needle, u" ")
        return cleaned
    return strip_all
# Callable that replaces each of these leftover markup characters with a space.
mediawiki_stripper = multi_strip(u"[]{}|=#*':;!-<>\"")
# A line that begins with any of these prefixes is table/markup boilerplate
# and is blanked entirely by normalize().
mediawiki_startwords = (
    "__TOC__", "commonsAmpersand",
    "wikitext", "text/x-wiki", "text/xwiki", "Indent", '|', 'Weather box', '{'
)
def normalize(text):
    """Strip MediaWiki/HTML markup from one line of wikitext, returning plain text.

    The substitutions are order-dependent; relies on the module-level
    `unescape`, `mediawiki_startwords`, `mediawiki_stripper` and `DEBUG`.
    """
    old = text[:]  # keep the raw line so DEBUG mode can show before/after
    text = unescape(text)  # decode HTML entities (&amp; etc.)
    # Drop 31-char lowercase alphanumeric tokens (presumably dump hash/id
    # strings -- TODO confirm against the dump format).
    text = re.sub(u"[a-z0-9]{31}", '', text)
    # Remove inline CSS fragments commonly found in wiki tables.
    text = re.sub(u'(vertical\-align\:\s*top|white\-space|nowrap|text\-align:\s*(left|center|right))', '', text)
    text = re.sub(u'\w+\-color\:\#\w+\;', '', text)
    # Remove common table/markup keywords wherever they appear.
    text = re.sub(u'(width|auto|rowspan|category|REDIRECT|colspan|background)', '', text)
    text = re.sub(u'\'\'\'', '', text)  # bold markup '''
    # NOTE(review): trailing "|" makes an empty alternation branch, so this
    # pattern matches the empty string and removes nothing -- likely intended
    # something like u'style=[^;]*;'.  Left as-is to preserve behavior.
    text = re.sub(u'style=*\;|', '', text)
    # NOTE(review): '\"*\"' matches only a run of quote characters after '=',
    # not the quoted attribute value -- probably intended '\"[^\"]*\"'.
    # Left as-is (same for align/bgcolor/class below).
    text = re.sub(u'style=\"*\"', '', text)
    text = re.sub(u'align=\"*\"', '', text)
    text = re.sub(u'bgcolor=\"*\"', '', text)
    text = re.sub(u'class=\"*\"', '', text)
    # Strip bare URLs up to the following whitespace.
    text = re.sub(u'http\://[\w\W^\s]+\s', '', text)
    # Strip interwiki links [[:...]] and file/image links.
    text = re.sub(u'\[\[\:[^<]+?\]\]', '', text)
    # Japanese "File:" links ([[ファイル:...]]), then any stragglers.
    text = re.sub(u'\[\[ファイル\:[^<]+(\]\]|\|)', '', text)
    text = re.sub(u'\[\[ファイル\:', '', text)
    text = re.sub(u'ファイル\:[^<]+(svg|png|jpg|gif|SVG|PNG|JPG|GIF)', '', text)
    # NOTE(review): [^<\s\S] is an empty character class and can never match;
    # the next pattern covers the intended [[File:...]] case.  Left as-is.
    text = re.sub(u'\[\[File[^<\s\S]+?\]\]', '', text)
    text = re.sub(u'\[\[File[^<]+?\]\]', '', text)
    text = re.sub(u'\[\[File[\s\S]+\.(svg|png|jpg|gif|SVG|PNG|JPG|GIF)', '', text)
    text = re.sub(u'\[\[Category[^<]+?\]\]', '', text)
    text = re.sub(u'\[\[Image[^<]+?\]\]', '', text)
    text = re.sub('{{[^<]+?}}', '', text)  # templates {{...}}
    text = re.sub('<[^<]+?>', '', text)  # remaining XML/HTML tags
    # Blank the whole line if it starts with markup boilerplate.
    for startwords in mediawiki_startwords:
        if text.startswith(startwords): text = u""
    text = mediawiki_stripper(text)  # blank out leftover markup characters
    text = text.replace(u"\n", u" ")
    text = text.replace(u"\r", u" ")
    if DEBUG:
        return text + u", " + old
    else:
        return text
if __name__ == "__main__":
    # --- command line -------------------------------------------------
    if len(sys.argv) > 1:
        wiki_xml_path = sys.argv[1]
    else:
        # Fix: usage errors go to stderr with a non-zero exit status
        # (the original printed to stdout and exited 0).
        sys.stderr.write(
            "usage: python dump_jawiki.py "
            "[jawiki-YYYYMMDD-pages-articles.xml.bz2] [output]\n")
        sys.exit(1)
    if len(sys.argv) > 2:
        f_out = codecs.open(sys.argv[2], 'w', 'utf-8')
    else:
        f_out = sys.stdout

    # --- stupid parser ------------------------------------------------
    # Streams the bz2 XML dump line by line and tracks position with flags
    # instead of a real XML parser.
    node_page = False   # currently inside a <page> element
    node_ns0 = False    # the current page is in namespace 0 (articles)
    node_text = False   # currently inside the <text ...> element
    previous_line = u"\n"
    for line in bz2.BZ2File(wiki_xml_path):
        line = line.strip().decode('utf-8')
        if not node_page:
            node_page = (line == u"<page>")
        if node_page:
            node_page = not (line == u"</page>")
            # End of an article: terminate its single output line.
            if (line == u"</page>") and previous_line != u"\n":
                f_out.write(u"\n")
                previous_line = u"\n"
        if not node_ns0:
            node_ns0 = node_page and (line == u"<ns>0</ns>")
        if not node_page:
            node_ns0 = False
        if not node_text:
            node_text = node_ns0 and line.startswith(u"<text")
        if not node_page:
            node_text = False
        if node_text:
            normalized_line = normalize(line).strip()
            # Skip very short fragments (mostly leftover markup).
            if len(normalized_line) > 10:
                f_out.write(normalized_line + u" ")
                previous_line = normalized_line
    # Fix: only close files we opened ourselves -- never close sys.stdout.
    if f_out is not sys.stdout:
        f_out.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment