Skip to content

Instantly share code, notes, and snippets.

Created December 30, 2012 16:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/4413657 to your computer and use it in GitHub Desktop.
Save anonymous/4413657 to your computer and use it in GitHub Desktop.
Imports Wikipedia (given as article dump) into a CouchDB and adds couchdb-lucene search capabilities
# Imports a Wikipedia dump into a CouchDB
# ... and makes it searchable by Apache Lucene
# REQUIREMENTS:
# - CouchDB
# - couchdb-lucene (https://github.com/rnewson/couchdb-lucene)
# - couchdb-python (http://code.google.com/p/couchdb-python/)
# Usage: start CouchDB, run this script, and start couchdb-lucene while the
# import is still running.
# Trigger indexing with a sample query:
# http://127.0.0.1:5984/_fti/local/simple/_design/search/by_text?q=information&limit=10
import string
import sys

import couchdb
#import xml.etree.ElementTree as etree # for python-builtin
from lxml import etree # when LXML installed
# Path of the Wikipedia article dump to import.
#WIKI = "./Wikipedia.Articles.2012.xml"
WIKI = "./simplewiki-20121220-pages-articles.xml"
# Name of the CouchDB database that receives the articles.
DB = "simple" # "wiki"
server = couchdb.Server("http://127.0.0.1:5984/")
# Start from an empty database: drop a previous import if there is one.
# Only "database does not exist" is ignored; any other failure (server
# unreachable, permission denied, Ctrl-C, ...) is reported instead of being
# silently swallowed.
try:
    server.delete(DB)
except couchdb.ResourceNotFound:
    pass
server.create(DB)
couch = server[DB]
# ------------------------------------------------------------------------------
# LUCENE SETUP
# Template of a couchdb-lucene index function; ``%s`` is replaced by the name
# of the document field that gets added to the Lucene document.
_INDEX_FUNCTION = (
    "\n"
    "function(doc) {\n"
    "var ret = new Document();\n"
    "ret.add(doc.%s);\n"
    "return ret\n"
    "}\n"
)
# Design document with one full-text index per article field
# ("by_title" and "by_text").
_search_design = {
    "_id": "_design/search",
    "fulltext": dict(
        ("by_" + field, {"index": _INDEX_FUNCTION % field})
        for field in ("title", "text")
    ),
}
couch.save(_search_design)
# ------------------------------------------------------------------------------
# XML IMPORTER
class WikiParser(object):
    """Parser target that stores every article of a wiki dump in CouchDB.

    The object implements the ``start`` / ``end`` / ``data`` / ``close``
    callbacks of an ``etree.XMLParser`` target.  It collects the character
    data of each ``title`` and ``text`` element and, when a ``text`` element
    ends, saves one document ``{'_id', 'title', 'text'}`` (all values
    stripped of surrounding whitespace).  Redirect pages and pages without a
    title or text are skipped.

    Args:
        db: Object with a ``save(doc)`` method that receives the documents.
            When omitted, the module-level ``couch`` database is used.
    """

    def __init__(self, db=None):
        self.db = db
        # Collection state is initialised up front so that every callback is
        # safe to call in any order.
        self.is_title = False
        self.is_text = False
        self.title = ""
        self.text = ""

    @staticmethod
    def _local_name(tag):
        """Return *tag* without a leading ``{namespace}`` prefix."""
        return tag.rsplit('}', 1)[-1]

    def _store(self, title, text):
        """Save one article; report and skip it if the database rejects it."""
        # Compare against None explicitly: truth-testing a database object
        # may itself trigger a request to the server.
        db = couch if self.db is None else self.db
        try:
            db.save({'_id': title, 'title': title, 'text': text})
        except couchdb.HTTPError as exc:
            # Best effort: one rejected document (duplicate title, invalid
            # id, ...) must not abort the whole import, but it is reported.
            sys.stderr.write("skipped %r: %s\n" % (title, exc))

    def start(self, tag, attrib):
        """Begin collecting character data for ``title`` / ``text`` elements."""
        name = self._local_name(tag)
        # Exact match on the local name, so that an unrelated tag that merely
        # ends in "text" or "title" is not mistaken for article content.
        self.is_text = name == 'text'
        self.is_title = name == 'title'
        if self.is_text:
            self.text = ""
        elif self.is_title:
            self.title = ""

    def end(self, tag):
        """Store the collected article when a ``text`` element ends."""
        if self._local_name(tag) == 'text':
            title = self.title.strip()
            text = self.text.strip()
            # The redirect marker is checked case-insensitively and on the
            # stripped text, so "#redirect" and leading whitespace are
            # recognised as well.
            is_redirect = text.upper().startswith('#REDIRECT')
            if title and text and not is_redirect:
                self._store(title, text)
        # Stop collecting after every closing tag so that whitespace between
        # elements is never appended to a title or text.
        self.is_title = self.is_text = False

    def data(self, data):
        """Append character data to whichever field is being collected."""
        if self.is_title:
            self.title += data
        elif self.is_text:
            self.text += data

    def close(self):
        """Finish parsing; there is nothing to clean up or return."""
        pass
# Stream the dump through the parser target; WikiParser stores each article
# as a side effect of parsing.  The file is opened with open() in binary mode:
# file() exists only in Python 2, and the XML parser should receive the raw
# bytes so that it can apply the encoding declared in the dump itself.
with open(WIKI, "rb") as wiki:
    parser = etree.XMLParser(target=WikiParser())
    etree.parse(wiki, parser)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment