Created
January 3, 2020 02:29
-
-
Save nguyenvulebinh/5e55cced8c2aa9decb7df2e23c63a585 to your computer and use it in GitHub Desktop.
Parse text from a Wikipedia dump file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.sax | |
import os | |
import re | |
import subprocess | |
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX.

    Captures the character data of the ``title``, ``text`` and ``timestamp``
    elements.  Each time a ``page`` element closes, the page title and the
    cleaned page text are queued; every ``bulk_size`` pages the queue is
    written to ``<save_dir>/<count_page>.txt``.
    """

    # Elements whose character data is captured.
    _CAPTURED_TAGS = ('title', 'text', 'timestamp')

    @staticmethod
    def _sub_until_stable(pattern, repl, text):
        """Apply ``re.sub`` (DOTALL) repeatedly until ``text`` stops changing."""
        while True:
            replaced = re.sub(pattern, repl, text, flags=re.DOTALL)
            if replaced == text:
                return text
            text = replaced

    @staticmethod
    def clean_text(text):
        """Strip wiki markup from ``text`` and return plain text lines.

        Removes or unwraps links, bold/italic quotes, templates, HTML-like
        tags and comments, bracketed spans and braces, then drops blank
        lines, leading list markers and the ``=`` signs around headings.

        Patterns that involve ``<``, ``>`` or ``&`` accept optional
        whitespace after the opening character so they match both clean
        markup (``</ref>``, ``&nbsp;``) and markup that had spaces injected
        at entity boundaries (``< /ref >``, ``& nbsp;``).

        Args:
            text: Raw wiki markup of one page.

        Returns:
            The cleaned text, one non-empty source line per output line,
            joined with ``"\\n"``.
        """
        sub_until_stable = WikiXmlHandler._sub_until_stable

        # [[target|label]] -> label; repeated because links can nest.
        text = sub_until_stable(r'\[\[(((?!\[).)+)\]\]',
                                lambda m: m.group(1).split('|')[-1], text)
        # '''bold''' -> bold
        text = re.sub(r"'''(((?!').)*)'''", lambda m: m.group(1), text, flags=re.DOTALL)
        # {{templates}}, innermost first.
        text = sub_until_stable(r"{{(((?![\{\}]).)*)}}", "", text)
        # HTML comments.  Done before tag stripping so "<!-- ... -->" is not
        # mistaken for an opening tag; non-greedy so one match never spans
        # the text between two separate comments.
        text = re.sub(r"<\s*!--.*?--\s*>", "", text, flags=re.DOTALL)
        # Self-closing tags such as <ref name="x" />.
        text = re.sub(r"<((?!<).)*\/\s*>", "", text, flags=re.DOTALL)
        # <tag>...</tag> pairs together with their content, innermost first.
        text = sub_until_stable(r"<[^<\/]+>((?!<).)*<\s*\/[^>]+>", "", text)
        # Remaining [bracketed] spans (e.g. external links).
        text = re.sub(r"\[((?!\[).)*\]", "", text, flags=re.DOTALL)
        # ''italic'' surrounded by spaces keeps its surrounding spaces.
        text = re.sub(r" ''(((?!').)*)'' ", lambda m: ' {} '.format(m.group(1)),
                      text, flags=re.DOTALL)
        # Leftover {single-brace} spans, innermost first.
        text = sub_until_stable(r"{((?![\{\}]).)*}", "", text)
        # Any other ''italic'' -> italic
        text = re.sub(r"''(((?!'').)*)''", lambda m: m.group(1), text, flags=re.DOTALL)
        # Trim whitespace just inside double quotes (single line only).
        text = re.sub(r'"\s*(((?!").)*)\s*"',
                      lambda m: '"{}"'.format(m.group(1).strip()), text)
        text = re.sub(r'&\s*nbsp;', '', text)

        # Drop blank lines and leading list / indent markers.
        lines = [re.sub(r'^\s*[*:\-\.–#]+', "", item).strip()
                 for item in text.split('\n') if item.strip()]
        # Unwrap "== Heading ==" and drop lines emptied by the previous step.
        lines = [re.sub(r'=+([^=]*)=+', lambda m: '{}'.format(m.group(1).strip()), item).strip()
                 for item in lines if item.strip()]
        return "\n".join(lines)

    def __init__(self, save_dir='./wiki-text', bulk_size=10000):
        """Create the handler and make sure the output directory exists.

        Args:
            save_dir: Directory that receives the ``<count>.txt`` files.
            bulk_size: Number of pages per output file; must be positive.
        """
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element currently captured
        self._values = {}          # captured tag name -> text for the current page
        self._current_tag = None   # name of the element being captured, if any
        self._pages = []           # (title, cleaned text) tuples awaiting a flush
        self.count_page = 0
        self.bulk_size = bulk_size
        self.save_dir = save_dir
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.save_dir, exist_ok=True)

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag is not None:
            self._buffer.append(content)

    def save_bulk_pages(self):
        """Write the queued pages to ``<save_dir>/<count_page>.txt`` and clear the queue.

        Pages whose title contains ``":"`` or whose first 50 characters of
        text contain ``redirect`` / ``đổi`` (case-insensitive) are skipped.
        Nothing is written when the queue is empty, so a final flush can
        never truncate a file written by the preceding periodic flush.
        """
        if not self._pages:
            return
        path = os.path.join(self.save_dir, '{}.txt'.format(self.count_page))
        with open(path, 'w', encoding='utf-8') as file_text:
            for title, body in self._pages:
                head = body[:50].lower()
                if ":" in title or 'redirect' in head or 'đổi' in head:
                    continue
                file_text.write("=!= {} =!=\n\n{}\n\n".format(title, body))
        self._pages = []

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in self._CAPTURED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            # SAX may deliver one text node in several chunks; they must be
            # concatenated verbatim, otherwise spaces appear inside words
            # and around every entity reference.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so whitespace between elements is not buffered.
            self._current_tag = None
            self._buffer = None
        if name == 'page':
            self.count_page += 1
            # .get() guards against a page lacking <title> or <text>.
            self._pages.append((self._values.get('title', ''),
                                WikiXmlHandler.clean_text(self._values.get('text', ''))))
            # Reset so values from this page never leak into the next one.
            self._values = {}
            # Flush after queuing the current page, so the file named after
            # count_page contains that page.
            if self.count_page % self.bulk_size == 0:
                print("{:,}".format(self.count_page))
                self.save_bulk_pages()
# Object for handling xml
handler = WikiXmlHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
# Stream the dump through `bzcat` and feed the decompressed bytes to the
# incremental parser.  The context managers close the dump file and the pipe
# and reap the child process, even when parsing fails.
with open('viwiki-20191220-pages-meta-current.xml.bz2', 'rb') as dump_file, \
        subprocess.Popen(['bzcat'], stdin=dump_file, stdout=subprocess.PIPE) as bzcat:
    try:
        for line in bzcat.stdout:
            parser.feed(line)
        # Tell the parser the input is complete so a truncated document is
        # reported instead of being silently accepted.
        parser.close()
    finally:
        # Write out the pages queued since the last periodic flush.
        handler.save_bulk_pages()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment