Skip to content

Instantly share code, notes, and snippets.

@zeisss
Created January 11, 2010 15:58
Show Gist options
  • Save zeisss/274324 to your computer and use it in GitHub Desktop.
Save zeisss/274324 to your computer and use it in GitHub Desktop.
"""
Parse the wikipedia dump into tiny parts.
"""
import sys, string
from xml.sax import saxutils, handler, make_parser
# --- Coroutines that batch parsed pages and store them in CouchDB
def storer(database_name='wikipedia-de'):
    """Coroutine that bulk-inserts lists of documents into CouchDB.

    Connects to the CouchDB server at http://localhost:5984/ and opens the
    database ``database_name``, creating it first if it does not exist.
    The coroutine must be primed (advanced to its first ``yield``) before
    use; afterwards every ``send(docs)`` writes the list ``docs`` with one
    ``db.update`` call.  Empty batches are skipped.
    """
    from couchdb import client
    server = client.Server('http://localhost:5984/')
    if database_name not in server:
        print("Creating database %s" % database_name)
        # Keep the handle returned by create(): previously ``db`` was bound
        # only when the database already existed, so the first bulk insert
        # into a freshly created database failed with a NameError.
        db = server.create(database_name)
    else:
        db = server[database_name]
    while True:
        docs = (yield)
        if not docs:
            # Nothing to store; avoid an empty bulk request.
            continue
        # Write without a newline and flush so the progress text is visible
        # while the (potentially slow) bulk update is running.
        sys.stdout.write("Bulk inserting %r documents ... " % len(docs))
        sys.stdout.flush()
        db.update(docs)
        print("done")
def grouper(storer, count=10000):
    """Coroutine that collects pages into batches and forwards them.

    After priming, every ``send(page)`` appends ``page`` (a dict with at
    least a ``"title"`` key) to the current batch.  Once the batch holds
    ``count`` pages it is passed on with ``storer.send(batch)`` and a new
    batch is started.  When the coroutine is closed or fails, any pages
    still pending are forwarded as a final, smaller batch.

    ``storer`` is any object with a ``send(list)`` method, typically a
    primed generator.
    """
    bulk = []
    try:
        while True:
            page = (yield)
            print(u"Adding %r" % page["title"])
            bulk.append(page)
            if len(bulk) >= count:
                # Detach the batch before sending so that a failing send
                # does not cause the same batch to be re-sent in ``finally``.
                batch, bulk = bulk, []
                storer.send(batch)
    finally:
        # Make sure any leftover doc is stored, but never send an empty batch.
        if bulk:
            storer.send(bulk)
class WikiParser(handler.ContentHandler):
    """SAX content handler that builds one dict per ``<page>`` element.

    The handler tracks the current element path below the ``<mediawiki>``
    root in ``self.path`` and fills ``self.page`` while a page is being
    read.  At every ``</page>`` the finished dict is handed over with
    ``storer.send(page)``.

    ``storer`` is any object with a ``send(dict)`` method, typically a
    primed generator.
    """

    # Element path -> (keys of the enclosing dicts inside the page, field).
    # Text found at one of these paths is stored in the named field.
    _TEXT_FIELDS = {
        # Metadata
        ("page", "title"): ((), "title"),
        ("page", "id"): ((), "dump-id"),
        ("page", "restrictions"): ((), "restrictions"),
        ("page", "revision", "id"): (("revision",), "id"),
        ("page", "revision", "text"): (("revision",), "text"),
        ("page", "revision", "timestamp"): (("revision",), "timestamp"),
        ("page", "revision", "comment"): (("revision",), "comment"),
        # Contributor
        ("page", "revision", "contributor", "username"):
            (("revision", "contributor"), "username"),
        ("page", "revision", "contributor", "ip"):
            (("revision", "contributor"), "ip"),
        ("page", "revision", "contributor", "id"):
            (("revision", "contributor"), "id"),
    }

    def __init__(self, storer):
        handler.ContentHandler.__init__(self)
        self.storer = storer
        # Per-instance state; as class attributes the list and dict were
        # shared between all parser instances.
        self.path = []
        self.page = {}

    # ContentHandler methods

    def startElement(self, name, attrs):
        """Push ``name`` on the path and create the dicts for new sections."""
        if name == "mediawiki":
            # The root element is deliberately not part of the path.
            return
        self.path.append(name)
        if name == "page":
            self.page = {'type': 'http://types.moinz.de/wikipedia/Article'}
        elif name == "revision":
            self.page['revision'] = {}
        elif name == "minor":
            self.page['revision']['minor'] = True
        elif name == "contributor":
            self.page['revision']['contributor'] = {}

    def endElement(self, name):
        """Pop ``name`` from the path; emit the page when ``</page>`` closes."""
        if name == "mediawiki":
            # Mirror startElement: the root was never pushed, so popping here
            # would raise IndexError at the end of the document.
            return
        self.path.pop()
        if name == "page":  # end of </page>
            self.storer.send(self.page)

    def characters(self, content):
        """Store text content in the page field matching the current path."""
        target = self._TEXT_FIELDS.get(tuple(self.path))
        if target is not None:
            parents, field = target
            node = self.page
            for key in parents:
                node = node[key]
            # A SAX parser may deliver the text of one element in several
            # chunks, so append to what was already collected instead of
            # overwriting it.
            node[field] = node.get(field, u"") + content
        elif content.strip() == "" or (self.path and self.path[0] == "siteinfo"):
            # Whitespace between elements and everything below <siteinfo>
            # is ignored.
            pass
        else:
            print("Unknown tag content for path %s: %s" % (self.path, content))
# def ignorableWhitespace(self, content):
# self._out.write(content)
# --- The main program
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s <wikipedia-dump.xml>" % sys.argv[0])

    # The storer performs bulk dumps of the articles into the couchdb
    s = storer()
    next(s)  # advance to the first yield so it can receive batches

    # This groups incoming documents into a list and sends the list to the
    # next level
    g = grouper(s, 10000)
    next(g)  # advance to the first yield so it can receive pages

    parser = make_parser()
    parser.setContentHandler(WikiParser(g))
    try:
        parser.parse(sys.argv[1])
    finally:
        # Closing the grouper runs its ``finally`` clause, which forwards the
        # last partial batch to the storer.  The previous ``handler.stop()``
        # referred to the xml.sax.handler module, which has no ``stop``
        # attribute, and raised AttributeError after parsing.
        g.close()
        s.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment