Skip to content

Instantly share code, notes, and snippets.

@buhii
Last active January 4, 2016 18:29
Show Gist options
  • Save buhii/8660433 to your computer and use it in GitHub Desktop.
Save buhii/8660433 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import codecs
import HTMLParser
import sys
import re
import bz2
# When true, normalize() returns the cleaned text followed by ", " and the
# original input, so both can be compared side by side.
DEBUG = False
# unescape method of a shared HTMLParser instance; normalize() applies it to
# each line before any markup is stripped (presumably decoding character
# references such as "&lt;" and "&quot;" -- confirm against HTMLParser docs).
unescape = HTMLParser.HTMLParser().unescape
def multi_strip(candidates):
    """Build a function that blanks out every occurrence of the given substrings.

    Args:
        candidates: Iterable of substrings to remove.  A plain string is
            treated as a collection of its individual characters.

    Returns:
        A one-argument function that returns its text with every occurrence
        of each candidate replaced by a single space.
    """
    # Snapshot the candidates once.  The returned closure is meant to be
    # called repeatedly, so a one-shot iterator would otherwise be exhausted
    # by the first call and every later call would silently strip nothing.
    candidates = tuple(candidates)

    def inner(text):
        for s in candidates:
            text = text.replace(s, u" ")
        return text

    return inner
# Function that replaces each of these markup punctuation characters with a
# space (multi_strip iterates the string character by character).
mediawiki_stripper = multi_strip(u"[]{}|=#*':;!-<>\"")
# Prefixes checked by normalize() after its regex clean-up: text that starts
# with any of them is replaced by an empty string.
mediawiki_startwords = (
"__TOC__", "commonsAmpersand",
"wikitext", "text/x-wiki", "text/xwiki", "Indent", '|', 'Weather box', '{'
)
# Patterns whose matches normalize() deletes, applied in this order.  They
# are compiled once here because normalize() runs for every line of the dump.
# ASCII-only patterns are raw literals; patterns containing Japanese text are
# unicode literals with doubled backslashes so they mean the same thing on
# Python 2 and Python 3.
_MARKUP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # Runs of 31 lowercase alphanumerics (presumably the <sha1> values that
    # follow the page text in the dump -- confirm).
    r"[a-z0-9]{31}",
    # Inline CSS declarations.
    r"vertical-align:\s*top|white-space|nowrap|text-align:\s*(?:left|center|right)",
    r"\w+-color:#\w+;",
    # Table/markup keywords.  NOTE(review): removed wherever they occur,
    # including inside ordinary words.
    r"width|auto|rowspan|category|REDIRECT|colspan|background",
    # Bold markup.
    r"'''",
    # Unquoted "style=...;" values.  The value may not contain quotes,
    # whitespace or "|" so a match cannot run into the following cell text.
    r'style=(?:[^;"\s|]*;)+',
    # Quoted attributes including their value.  A dangling opening quote
    # (no closing quote before the next "|") is still removed on its own.
    r'(?:style|align|bgcolor|class)="(?:[^"|]*")?',
    # External links.  Only characters that are legal in a URL are consumed,
    # so the match stops at whitespace, at wiki delimiters such as "]" or
    # "|", and at adjacent Japanese text.
    r"https?://[A-Za-z0-9\-._~:/?#@!$&'()*+,;=%]+",
    # Links that start with a colon, e.g. [[:Category:...]].
    r"\[\[:[^<]+?\]\]",
    # Japanese file links.  NOTE(review): "[^<]+" is greedy, so the first
    # and third pattern can swallow text up to the last "]]", "|" or image
    # extension on the line -- confirm whether that is acceptable.
    u"\\[\\[ファイル:[^<]+(?:\\]\\]|\\|)",
    u"\\[\\[ファイル:",
    u"ファイル:[^<]+(?:svg|png|jpg|gif|SVG|PNG|JPG|GIF)",
    # English file, category and image links.
    r"\[\[File[^<]+?\]\]",
    # NOTE(review): greedy; removes everything from "[[File" up to the last
    # image extension on the line.
    r"\[\[File[\s\S]+\.(?:svg|png|jpg|gif|SVG|PNG|JPG|GIF)",
    r"\[\[Category[^<]+?\]\]",
    r"\[\[Image[^<]+?\]\]",
    # Templates and XML/HTML tags.
    r"\{\{[^<]+?\}\}",
    r"<[^<]+?>",
))


def normalize(text):
    """Strip MediaWiki/HTML markup from one line of wiki dump text.

    The text is first passed through ``unescape``; then every pattern in
    ``_MARKUP_PATTERNS`` is deleted in order.  If what remains starts with one
    of ``mediawiki_startwords`` it is discarded entirely.  Finally the
    remaining markup punctuation is replaced by spaces via
    ``mediawiki_stripper`` and CR/LF characters become spaces.

    Args:
        text: Unicode text of a single line.

    Returns:
        The cleaned text.  When the module-level ``DEBUG`` flag is true, the
        original input is appended after ``", "`` for comparison.
    """
    old = text  # strings are immutable, so no copy is needed
    text = unescape(text)
    for pattern in _MARKUP_PATTERNS:
        text = pattern.sub(u"", text)
    # str.startswith accepts a tuple of prefixes, so one call covers them all.
    if text.startswith(tuple(mediawiki_startwords)):
        text = u""
    text = mediawiki_stripper(text)
    text = text.replace(u"\n", u" ").replace(u"\r", u" ")
    if DEBUG:
        return text + u", " + old
    return text
def _extract_articles(lines, f_out):
    """Write the normalized text of namespace-0 pages from a dump to f_out.

    Every page that yields text produces one output line: the normalized
    lines of the page joined by single spaces, terminated by a newline when
    the closing ``</page>`` tag is reached.

    Args:
        lines: Iterable of UTF-8 encoded byte lines of the XML dump.
        f_out: File-like object whose ``write`` accepts unicode text.
    """
    # stupid parser: the dump is never parsed as XML.  Each stripped line is
    # compared against the exact tags below to track the current position.
    node_page = False       # inside <page> ... </page>
    node_ns0 = False        # the current page contains <ns>0</ns>
    node_text = False       # a line starting with "<text" was seen on this page
    previous_line = u"\n"   # u"\n" means nothing was written since the last newline
    for line in lines:
        line = line.strip().decode('utf-8')
        if not node_page:
            node_page = (line == u"<page>")
        if node_page:
            node_page = not (line == u"</page>")
        if (line == u"</page>") and previous_line != u"\n":
            # The page produced output, so terminate its line.
            f_out.write(u"\n")
            previous_line = u"\n"
        if not node_ns0:
            node_ns0 = node_page and (line == u"<ns>0</ns>")
        if not node_page:
            node_ns0 = False
        if not node_text:
            node_text = node_ns0 and line.startswith(u"<text")
        if not node_page:
            node_text = False
        if node_text:
            # node_text is only reset at </page>, so the lines between the
            # end of the text and </page> are passed to normalize() as well.
            normalized_line = normalize(line).strip()
            # Very short fragments are dropped.
            if len(normalized_line) > 10:
                f_out.write(normalized_line + u" ")
                previous_line = normalized_line


if __name__ == "__main__":
    if len(sys.argv) > 1:
        wiki_xml_path = sys.argv[1]
    else:
        print("usage: python dump_jawiki.py [jawiki-YYYYMMDD-pages-articles.xml.bz2] [output]")
        sys.exit(0)
    if len(sys.argv) > 2:
        f_out = codecs.open(sys.argv[2], 'w', 'utf-8')
        owns_output = True
    else:
        # Encode explicitly: on Python 2 a piped or redirected sys.stdout
        # falls back to ASCII and raises UnicodeEncodeError on Japanese text.
        # On Python 3 the bytes go to the underlying binary buffer instead.
        f_out = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
        owns_output = False
    try:
        f_in = bz2.BZ2File(wiki_xml_path)
        try:
            _extract_articles(f_in, f_out)
        finally:
            f_in.close()
    finally:
        # Only close what this script opened; stdout is merely flushed.
        if owns_output:
            f_out.close()
        else:
            f_out.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment