Skip to content

Instantly share code, notes, and snippets.

@nguyenvulebinh
Created January 3, 2020 02:29
Show Gist options
  • Save nguyenvulebinh/5e55cced8c2aa9decb7df2e23c63a585 to your computer and use it in GitHub Desktop.
Save nguyenvulebinh/5e55cced8c2aa9decb7df2e23c63a585 to your computer and use it in GitHub Desktop.
Parse text from a Wikipedia dump file
import xml.sax
import os
import re
import subprocess
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that extracts plain text from a MediaWiki XML dump.

    For every ``<page>`` element the handler records the ``title`` and
    ``text`` children, strips the wiki markup from the text with
    :meth:`clean_text` and buffers the ``(title, cleaned_text)`` pair in
    memory.  After every ``bulk_size`` pages the buffer is written to
    ``<save_dir>/<page count>.txt`` by :meth:`save_bulk_pages`.  The caller
    must invoke :meth:`save_bulk_pages` once more after parsing to flush the
    remainder.

    Args:
        save_dir: Directory that receives the text files; created if missing.
        bulk_size: Number of pages per output file (must be >= 1).

    Raises:
        ValueError: If ``bulk_size`` is smaller than 1.
    """

    # Elements whose character data is captured.
    _TRACKED_TAGS = ('title', 'text', 'timestamp')

    # Markup patterns, compiled once.  The optional ``\s*`` after ``<``/``&``
    # and before ``>`` lets the patterns match both genuine markup
    # (``</ref>``) and text where a space was inserted around decoded
    # entities (``< /ref >``).
    _LINK = re.compile(r'\[\[(((?!\[).)+)\]\]', re.DOTALL)
    _BOLD = re.compile(r"'''(((?!').)*)'''", re.DOTALL)
    _TEMPLATE = re.compile(r"\{\{(((?![\{\}]).)*)\}\}", re.DOTALL)
    _SELF_CLOSING_TAG = re.compile(r"<((?!<).)*\/\s*>", re.DOTALL)
    _PAIRED_TAG = re.compile(r"<[^<\/]+>((?!<).)*<\s*\/[^>]+>", re.DOTALL)
    _BRACKETED = re.compile(r"\[((?!\[).)*\]", re.DOTALL)
    _SPACED_ITALIC = re.compile(r" ''(((?!').)*)'' ", re.DOTALL)
    _BRACED = re.compile(r"\{((?![\{\}]).)*\}", re.DOTALL)
    _ITALIC = re.compile(r"''(((?!'').)*)''", re.DOTALL)
    # Non-greedy so that text between two separate comments survives.
    _COMMENT = re.compile(r"<\s*!--.*?--\s*>", re.DOTALL)
    _QUOTED = re.compile(r'"\s*(((?!").)*)\s*"')
    _NBSP = re.compile(r"&\s*nbsp;")
    _LINE_PREFIX = re.compile(r'^\s*[*:\-\.–#]+')
    _HEADING = re.compile(r'=+([^=]*)=+')

    @staticmethod
    def _sub_until_stable(pattern, repl, text):
        """Apply ``pattern.sub(repl, ...)`` until the text stops changing.

        Needed for nested constructs: each pattern only matches the innermost
        level, so removing it exposes the next level for the following pass.
        """
        while True:
            replaced = pattern.sub(repl, text)
            if replaced == text:
                return text
            text = replaced

    @staticmethod
    def clean_text(text):
        """Strip wiki markup from ``text`` and return the remaining plain text.

        The steps run in a fixed order: wiki links are replaced by their
        label, bold/italic quote markers are dropped, templates, HTML-like
        tags, bracketed spans, braces and comments are removed, whitespace
        just inside double quotes is trimmed and ``&nbsp;`` becomes a space.
        Finally blank lines are discarded, list/indent prefixes are removed
        and ``== heading ==`` markers are reduced to the heading text.

        Args:
            text: Raw wiki text of one page.

        Returns:
            The cleaned text, one non-blank source line per output line.
        """
        cls = WikiXmlHandler

        # [[target|label]] -> label, [[target]] -> target (innermost first).
        text = cls._sub_until_stable(
            cls._LINK, lambda m: m.group(1).split('|')[-1], text)
        text = cls._BOLD.sub(lambda m: m.group(1), text)
        text = cls._sub_until_stable(cls._TEMPLATE, "", text)
        text = cls._SELF_CLOSING_TAG.sub("", text)
        text = cls._sub_until_stable(cls._PAIRED_TAG, "", text)
        text = cls._BRACKETED.sub("", text)
        text = cls._SPACED_ITALIC.sub(
            lambda m: ' {} '.format(m.group(1)), text)
        text = cls._sub_until_stable(cls._BRACED, "", text)
        text = cls._ITALIC.sub(lambda m: m.group(1), text)
        text = cls._COMMENT.sub("", text)
        text = cls._QUOTED.sub(
            lambda m: '"{}"'.format(m.group(1).strip()), text)
        # A non-breaking space separates words, so keep a plain space.
        text = cls._NBSP.sub(' ', text)

        lines = []
        for raw_line in text.split('\n'):
            if not raw_line.strip():
                continue
            line = cls._LINE_PREFIX.sub("", raw_line).strip()
            if not line:
                continue
            line = cls._HEADING.sub(lambda m: m.group(1).strip(), line)
            lines.append(line.strip())
        return "\n".join(lines)

    def __init__(self, save_dir='./wiki-text', bulk_size=10000):
        xml.sax.handler.ContentHandler.__init__(self)
        if bulk_size < 1:
            raise ValueError("bulk_size must be >= 1")
        self._buffer = None        # character chunks of the current element
        self._values = {}          # tag name -> text for the current page
        self._current_tag = None   # tracked element being read, if any
        self._pages = []           # (title, cleaned text) awaiting a flush
        self.count_page = 0
        self.save_dir = save_dir
        self.bulk_size = bulk_size
        os.makedirs(self.save_dir, exist_ok=True)

    def characters(self, content):
        """Collect character data while inside a tracked element."""
        if self._current_tag:
            self._buffer.append(content)

    def save_bulk_pages(self):
        """Write the buffered pages to ``<save_dir>/<count_page>.txt``.

        Pages whose title contains ``:`` or whose first 50 characters contain
        ``redirect`` / ``đổi`` are skipped.  Does nothing when no pages are
        buffered, so a second call cannot truncate a file already written.
        """
        if not self._pages:
            return
        path = os.path.join(self.save_dir, '{}.txt'.format(self.count_page))
        with open(path, 'w', encoding='utf-8') as file_text:
            for title, text in self._pages:
                head = text[:50].lower()
                if ":" in title or 'redirect' in head or 'đổi' in head:
                    continue
                file_text.write("=!= {} =!=\n\n{}\n\n".format(title, text))
        self._pages = []

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name == 'page':
            # Start from a clean slate so a page that lacks a field never
            # inherits the value of the previous page.
            self._values = {}
        elif name in self._TRACKED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            # SAX may deliver contiguous text in several chunks (for example
            # around entities), so the chunks are concatenated verbatim.
            self._values[name] = ''.join(self._buffer)
            self._current_tag = None
            self._buffer = None
        elif name == 'page':
            self.count_page += 1
            self._pages.append((
                self._values.get('title', ''),
                WikiXmlHandler.clean_text(self._values.get('text', '')),
            ))
            # Flush after appending: every file name (the running page
            # count) is then used for exactly one non-empty flush.
            if self.count_page % self.bulk_size == 0:
                print("{:,}".format(self.count_page))
                self.save_bulk_pages()
# Object for handling xml
handler = WikiXmlHandler()
# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
# Stream the dump through ``bzcat`` and feed the decompressed XML to the
# incremental parser line by line.  The context managers close the
# compressed file and reap the child process even if parsing fails.
with open('viwiki-20191220-pages-meta-current.xml.bz2', 'rb') as dump_file, \
        subprocess.Popen(['bzcat'],
                         stdin=dump_file,
                         stdout=subprocess.PIPE) as bzcat:
    for line in bzcat.stdout:
        parser.feed(line)
try:
    # Tell the parser the input is complete so that a truncated or
    # malformed document is reported instead of passing silently.
    parser.close()
finally:
    # Flush the pages collected since the last bulk write, even when the
    # document turned out to be incomplete.
    handler.save_bulk_pages()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment