Last active
January 4, 2016 18:29
-
-
Save buhii/8660433 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Extract plain text from a Japanese Wikipedia XML dump (pages-articles.xml.bz2).
import codecs
import HTMLParser  # Python 2 module; renamed html.parser in Python 3
import sys
import re
import bz2
# When True, normalize() appends the original raw line after the cleaned one
# so before/after can be compared.
DEBUG = False
# HTML entity decoder (e.g. &amp; -> &), bound once at import time.
unescape = HTMLParser.HTMLParser().unescape
def multi_strip(candidates):
    """Build a function that blanks out markup characters.

    ``candidates`` is iterated element by element (so a plain string is
    treated as a set of single characters); the returned callable replaces
    every occurrence of each element with a single space.
    """
    def strip_all(text):
        cleaned = text
        for needle in candidates:
            cleaned = cleaned.replace(needle, u" ")
        return cleaned
    return strip_all
# Callable that replaces each of these leftover markup characters with a space.
mediawiki_stripper = multi_strip(u"[]{}|=#*':;!-<>\"")
# A line that begins with any of these prefixes is table/markup boilerplate
# and is blanked entirely by normalize().
mediawiki_startwords = (
    "__TOC__", "commonsAmpersand",
    "wikitext", "text/x-wiki", "text/xwiki", "Indent", '|', 'Weather box', '{'
)
def normalize(text):
    """Strip MediaWiki/HTML markup from one line of wikitext, returning plain text.

    The substitutions are order-dependent; relies on the module-level
    `unescape`, `mediawiki_startwords`, `mediawiki_stripper` and `DEBUG`.
    """
    old = text[:]  # keep the raw line so DEBUG mode can show before/after
    text = unescape(text)  # decode HTML entities (&amp; etc.)
    # Drop 31-char lowercase alphanumeric tokens (presumably dump hash/id
    # strings -- TODO confirm against the dump format).
    text = re.sub(u"[a-z0-9]{31}", '', text)
    # Remove inline CSS fragments commonly found in wiki tables.
    text = re.sub(u'(vertical\-align\:\s*top|white\-space|nowrap|text\-align:\s*(left|center|right))', '', text)
    text = re.sub(u'\w+\-color\:\#\w+\;', '', text)
    # Remove common table/markup keywords wherever they appear.
    text = re.sub(u'(width|auto|rowspan|category|REDIRECT|colspan|background)', '', text)
    text = re.sub(u'\'\'\'', '', text)  # bold markup '''
    # NOTE(review): trailing "|" makes an empty alternation branch, so this
    # pattern matches the empty string and removes nothing -- likely intended
    # something like u'style=[^;]*;'.  Left as-is to preserve behavior.
    text = re.sub(u'style=*\;|', '', text)
    # NOTE(review): '\"*\"' matches only a run of quote characters after '=',
    # not the quoted attribute value -- probably intended '\"[^\"]*\"'.
    # Left as-is (same for align/bgcolor/class below).
    text = re.sub(u'style=\"*\"', '', text)
    text = re.sub(u'align=\"*\"', '', text)
    text = re.sub(u'bgcolor=\"*\"', '', text)
    text = re.sub(u'class=\"*\"', '', text)
    # Strip bare URLs up to the following whitespace.
    text = re.sub(u'http\://[\w\W^\s]+\s', '', text)
    # Strip interwiki links [[:...]] and file/image links.
    text = re.sub(u'\[\[\:[^<]+?\]\]', '', text)
    # Japanese "File:" links ([[ファイル:...]]), then any stragglers.
    text = re.sub(u'\[\[ファイル\:[^<]+(\]\]|\|)', '', text)
    text = re.sub(u'\[\[ファイル\:', '', text)
    text = re.sub(u'ファイル\:[^<]+(svg|png|jpg|gif|SVG|PNG|JPG|GIF)', '', text)
    # NOTE(review): [^<\s\S] is an empty character class and can never match;
    # the next pattern covers the intended [[File:...]] case.  Left as-is.
    text = re.sub(u'\[\[File[^<\s\S]+?\]\]', '', text)
    text = re.sub(u'\[\[File[^<]+?\]\]', '', text)
    text = re.sub(u'\[\[File[\s\S]+\.(svg|png|jpg|gif|SVG|PNG|JPG|GIF)', '', text)
    text = re.sub(u'\[\[Category[^<]+?\]\]', '', text)
    text = re.sub(u'\[\[Image[^<]+?\]\]', '', text)
    text = re.sub('{{[^<]+?}}', '', text)  # templates {{...}}
    text = re.sub('<[^<]+?>', '', text)  # remaining XML/HTML tags
    # Blank the whole line if it starts with markup boilerplate.
    for startwords in mediawiki_startwords:
        if text.startswith(startwords): text = u""
    text = mediawiki_stripper(text)  # blank out leftover markup characters
    text = text.replace(u"\n", u" ")
    text = text.replace(u"\r", u" ")
    if DEBUG:
        return text + u", " + old
    else:
        return text
if __name__ == "__main__":
    # --- command line -------------------------------------------------
    if len(sys.argv) > 1:
        wiki_xml_path = sys.argv[1]
    else:
        # Fix: usage errors go to stderr with a non-zero exit status
        # (the original printed to stdout and exited 0).
        sys.stderr.write(
            "usage: python dump_jawiki.py "
            "[jawiki-YYYYMMDD-pages-articles.xml.bz2] [output]\n")
        sys.exit(1)
    if len(sys.argv) > 2:
        f_out = codecs.open(sys.argv[2], 'w', 'utf-8')
    else:
        f_out = sys.stdout

    # --- stupid parser ------------------------------------------------
    # Streams the bz2 XML dump line by line and tracks position with flags
    # instead of a real XML parser.
    node_page = False   # currently inside a <page> element
    node_ns0 = False    # the current page is in namespace 0 (articles)
    node_text = False   # currently inside the <text ...> element
    previous_line = u"\n"
    for line in bz2.BZ2File(wiki_xml_path):
        line = line.strip().decode('utf-8')
        if not node_page:
            node_page = (line == u"<page>")
        if node_page:
            node_page = not (line == u"</page>")
            # End of an article: terminate its single output line.
            if (line == u"</page>") and previous_line != u"\n":
                f_out.write(u"\n")
                previous_line = u"\n"
        if not node_ns0:
            node_ns0 = node_page and (line == u"<ns>0</ns>")
        if not node_page:
            node_ns0 = False
        if not node_text:
            node_text = node_ns0 and line.startswith(u"<text")
        if not node_page:
            node_text = False
        if node_text:
            normalized_line = normalize(line).strip()
            # Skip very short fragments (mostly leftover markup).
            if len(normalized_line) > 10:
                f_out.write(normalized_line + u" ")
                previous_line = normalized_line
    # Fix: only close files we opened ourselves -- never close sys.stdout.
    if f_out is not sys.stdout:
        f_out.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment