Skip to content

Instantly share code, notes, and snippets.

@stillinbeta
Created June 18, 2011 21:50
Show Gist options
  • Save stillinbeta/1033536 to your computer and use it in GitHub Desktop.
Save stillinbeta/1033536 to your computer and use it in GitHub Desktop.
A parser that writes Wikipedia links into CouchDB-compatible JSON
import re
import json
from xml import sax
# Matches simple wiki links of the form [[Target]] or [[Target|label]], where
# both parts consist only of word characters and spaces. The single capture
# group is the link *target* (the page being linked to), not the display
# label after the pipe, so the captured values are comparable to page titles.
reg = re.compile(r'\[\[([\w ]+?)(?:\|[\w ]+?)?\]\]')
# Path of the XML file handed to the SAX parser (placeholder; edit before use).
datafile = '/path/to/wikipedia/data'
# Maximum number of documents written to a single output file.
limit = 500
class PageHandler(sax.handler.ContentHandler):
    """SAX handler that writes one JSON document per <title>/<text> pair.

    Character data inside a ``<title>`` element becomes the document's
    ``_id``; character data inside the following ``<text>`` element is
    searched with the link regex and the matches are stored under ``links``.
    Documents are written in batches to numbered files, each of the form
    ``{"docs":[ ... ]}``.

    Args:
        outwriter: ``str.format`` pattern for output file names; ``{0}`` is
            replaced by the batch index. Defaults to
            ``/tmp/wiki/wikilinks-{0}.json``.
        batch_size: maximum documents per file. Defaults to the module-level
            ``limit``.
        link_pattern: compiled regex whose ``findall`` result is stored as
            ``links``. Defaults to the module-level ``reg``.

    Raises:
        ValueError: if the effective batch size is smaller than 1.
    """

    def __init__(self, outwriter=None, batch_size=None, link_pattern=None):
        sax.handler.ContentHandler.__init__(self)
        self.outwriter = ("/tmp/wiki/wikilinks-{0}.json"
                          if outwriter is None else outwriter)
        # Module-level settings are only looked up when no override is given,
        # so the handler can be used without them.
        self.limit = limit if batch_size is None else batch_size
        if self.limit < 1:
            raise ValueError("batch size must be at least 1")
        self.link_pattern = reg if link_pattern is None else link_pattern
        self.count = 0  # total documents written so far, across all files
        self.in_title = False
        self.in_text = False
        # Character data arrives in arbitrary chunks; collect the pieces and
        # join once instead of repeatedly concatenating strings.
        self._title_parts = []
        self._text_parts = []
        self._docs_in_file = 0
        self.out = None
        self._open_batch(0)

    def _open_batch(self, index):
        """Open output file number *index* and write the JSON prefix."""
        # json.dumps escapes non-ASCII by default, so the output is plain
        # ASCII whatever the platform's default text encoding is.
        self.out = open(self.outwriter.format(index), 'w')
        self.out.write('{"docs":[')
        self._docs_in_file = 0

    def _close_batch(self):
        """Terminate the JSON array/object and close the current file."""
        if self.out is not None and not self.out.closed:
            self.out.write(']}')
            self.out.close()

    def _write_doc(self, doc):
        """Append *doc* to the current batch, rotating files when full."""
        # Rotate lazily (just before a write) so that no empty trailing file
        # is created when the document total is a multiple of the batch size.
        if self._docs_in_file >= self.limit:
            self._close_batch()
            self._open_batch(self.count // self.limit)
        # Separator goes *between* documents; a trailing comma would make
        # the file invalid JSON.
        if self._docs_in_file:
            self.out.write(',\n')
        self.out.write(json.dumps(doc))
        self._docs_in_file += 1

    def close(self):
        """Finish the current output file. Safe to call more than once."""
        self._close_batch()

    def startElement(self, name, attributes):
        if name == 'title':
            self.in_title = True
            # Start from a clean slate so a title without a following <text>
            # element cannot leak into the next document's _id.
            self._title_parts = []
        elif name == 'text':
            self.in_text = True
            self._text_parts = []

    def characters(self, data):
        if self.in_title:
            self._title_parts.append(data)
        elif self.in_text:
            self._text_parts.append(data)

    def endElement(self, name):
        if name == 'title':
            self.in_title = False
        elif name == 'text':
            self.in_text = False
            title = ''.join(self._title_parts)
            text = ''.join(self._text_parts)
            self._title_parts = []
            self._text_parts = []
            self._write_doc({'_id': title,
                             'links': self.link_pattern.findall(text)})
            # Progress output: title and zero-based index of the document.
            print(title + ' ' + str(self.count))
            self.count += 1

    def endDocument(self):
        # Without this the last file would be left unterminated and open.
        self.close()
# Stream the XML file at `datafile` through a PageHandler. The handler writes
# its JSON output files as a side effect of the SAX callbacks, so nothing is
# returned here.
handler = PageHandler()
sax.parse(source=datafile, handler=handler)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment