Skip to content

Instantly share code, notes, and snippets.

@enjoylife
Created February 4, 2012 18:29
Show Gist options
  • Save enjoylife/1739351 to your computer and use it in GitHub Desktop.
Save enjoylife/1739351 to your computer and use it in GitHub Desktop.
xml extract
# Split a Wikipedia XML dump
# Evan Jones <evanj@mit.edu>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html
import sys
import xml.sax
def writeArticle(title, text, out=None):
    """Write one article as UTF-8: the title line, then the text.

    Redirect pages are skipped.  The ``#REDIRECT`` keyword is matched
    case-insensitively and after any leading whitespace, so ``#redirect``
    and ``#Redirect`` pages are filtered out as well.

    Args:
        title: Article title as a text (unicode) string.
        text: Article wikitext as a text (unicode) string.
        out: Optional binary file-like object with a ``write`` method.
            Defaults to the process's standard output.
    """
    if text.lstrip()[:9].lower() == "#redirect":
        return
    if out is None:
        # Python 3 exposes the byte stream as sys.stdout.buffer; on
        # Python 2 sys.stdout itself accepts encoded bytes.
        out = getattr(sys.stdout, "buffer", sys.stdout)
    # Each value is followed by a newline, matching what ``print`` emitted.
    out.write(title.encode("UTF-8") + b"\n")
    out.write(text.encode("UTF-8") + b"\n")
class WikiPageSplitter(xml.sax.ContentHandler):
    """SAX handler that hands each page's title and text to writeArticle.

    Only the ``page``, ``title`` and ``text`` elements are tracked on
    ``self.stack``; every other element is ignored (never pushed), so
    ``title`` and ``text`` are always seen with ``["page"]`` as the stack.

    Character data is collected in lists and joined once when the element
    closes, because SAX may deliver one element's content in many chunks
    and repeated ``+=`` on an attribute re-copies the growing string.
    While an element is open, ``self.title`` / ``self.text`` is ``""``;
    the full value is stored when the element ends.
    """

    def __init__(self):
        # Explicit base-class call (rather than super()) so this also works
        # where ContentHandler is an old-style class, as on Python 2.
        xml.sax.ContentHandler.__init__(self)
        self.stack = []
        self.text = None
        self.title = None
        self._title_parts = []
        self._text_parts = []

    def startElement(self, name, attributes):
        """Track the opening of a page/title/text element; ignore others."""
        if name == "page":
            assert self.stack == []
            # Reset per-page state.
            self.text = None
            self.title = None
            self._title_parts = []
            self._text_parts = []
        elif name == "title":
            assert self.stack == ["page"]
            assert self.title is None
            self.title = ""
            self._title_parts = []
        elif name == "text":
            assert self.stack == ["page"]
            assert self.text is None
            self.text = ""
            self._text_parts = []
        else:
            # Untracked element: it must not appear inside title or text.
            assert len(self.stack) == 0 or self.stack[-1] == "page"
            return
        self.stack.append(name)

    def endElement(self, name):
        """Close a tracked element; emit the article when its text ends."""
        if not self.stack or name != self.stack[-1]:
            # End of an element that was never pushed.
            return
        self.stack.pop()
        if name == "title":
            self.title = "".join(self._title_parts)
        elif name == "text":
            # We have the complete article: write it out.
            self.text = "".join(self._text_parts)
            writeArticle(self.title, self.text)

    def characters(self, content):
        """Collect character data for the currently open title or text."""
        assert content is not None and len(content) > 0
        if not self.stack:
            return
        current = self.stack[-1]
        if current == "title":
            self._title_parts.append(content)
        elif current == "text":
            assert self.title is not None
            self._text_parts.append(content)
# Script entry point: stream-parse the dump named by the first command-line
# argument.  Guarded so that importing this module does not start a parse.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <wikipedia-dump.xml>\n" % sys.argv[0])
        sys.exit(2)
    xml.sax.parse(sys.argv[1], WikiPageSplitter())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment