Skip to content

Instantly share code, notes, and snippets.

@FSX
Created August 19, 2010 17:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FSX/538415 to your computer and use it in GitHub Desktop.
Save FSX/538415 to your computer and use it in GitHub Desktop.
"""
Url: http://61924.nl/posts/00038-whoosh
Example:
>>> from searchengine import *
>>> search = SearchEngine('./index')
>>> search.create_index()
>>> search.add_document('http://example.org/somedocument',
...     'The document title', 'The content of the document')
>>> search.commit()
"""
import os, os.path, re
import whoosh.index
import whoosh.fields
import whoosh.qparser
# Matches the <script> embed tag for a gist, with or without the
# type="text/javascript" attribute.  Group 1 captures the numeric gist id.
# Raw strings keep the regex escapes intact; the dots of the host name are
# escaped so they only match a literal ".".
RE_GIST_JS = re.compile(
    r'<script(?: type="text/javascript")? src="'
    r'http://gist\.github\.com/([0-9]+)\.js"></script>')


def replace_gist_js(text):
    """Replace gist ``<script>`` embeds in *text* with plain HTML links.

    Every tag matched by ``RE_GIST_JS`` becomes
    ``<p><a href="http://gist.github.com/ID">[Gist ID]</a></p>``, where ID is
    the gist number taken from the script URL.  Text without such a tag is
    returned unchanged.
    """
    # The replacement must be a raw string: in a normal literal "\1" is the
    # control character \x01, not a backreference to group 1.
    return RE_GIST_JS.sub(
        r'<p><a href="http://gist.github.com/\1">[Gist \1]</a></p>', text)
# A "tag" here is anything from a "<" up to the nearest ">" with no other
# "<" in between.
REGEX_HTML_TAG = re.compile('<[^<]*?/?>')


def strip_html_tags(text):
    """Return *text* with every match of ``REGEX_HTML_TAG`` removed."""
    without_tags = re.sub(REGEX_HTML_TAG, '', text)
    return without_tags
class SearchEngine(object):
    """Convenience wrapper around a Whoosh index stored in one directory.

    Typical indexing session: call ``create_index()`` (or ``open_index()``
    for an index that already exists), then any number of ``add_document``,
    ``update_document`` and ``delete_document`` calls, and finally
    ``commit()`` to save them or ``cancel()`` to throw them away.  A new
    session can be started on the same instance afterwards.
    """

    # url identifies a document (unique) and is stored, as is title; content
    # is indexed but not stored.
    schema = whoosh.fields.Schema(
        url=whoosh.fields.ID(unique=True, stored=True),
        title=whoosh.fields.TEXT(stored=True, phrase=False),
        content=whoosh.fields.TEXT())

    # Query strings are parsed against the "content" field by default.
    _queryparser = whoosh.qparser.QueryParser('content', schema=schema)

    # Type used to coerce field values to text: ``unicode`` on Python 2,
    # ``str`` on Python 3 (where the name ``unicode`` does not exist).
    try:
        _text = unicode
    except NameError:
        _text = str

    # Set by open_index() / _get_writer(); None until then.
    _index = None
    _writer = None

    def __init__(self, index_path):
        """Remember *index_path* and create the directory if it is missing."""
        self.path = index_path
        if not os.path.exists(index_path):
            os.makedirs(index_path)

    def create_index(self):
        """Create a new index using ``schema`` in the directory and open it."""
        whoosh.index.create_in(self.path, self.schema)
        self.open_index()

    def open_index(self):
        """Open the index stored in the directory.

        Pending changes from an earlier open on this instance are cancelled
        first so that their writer does not stay behind.
        """
        self.cancel()
        self._index = whoosh.index.open_dir(self.path)

    def index_exists(self):
        """Return whether the directory already contains an index."""
        return whoosh.index.exists_in(self.path)

    def _get_writer(self):
        """Return the writer of the current session, creating it on demand.

        A writer cannot be reused once it has been committed or cancelled,
        so a fresh one is requested from the index whenever none is pending.
        Creating it lazily also means that an instance which is only used
        for searching never asks for a writer.
        """
        if self._writer is None:
            self._writer = self._index.writer()
        return self._writer

    def add_document(self, url, title, content):
        """Queue a new document; it is saved by the next ``commit()``."""
        self._get_writer().add_document(
            url=self._text(url),
            title=self._text(title),
            content=self._text(content))

    def update_document(self, url, title, content):
        """Queue an update of the document; saved by the next ``commit()``.

        NOTE(review): presumably replaces the document with the same ``url``
        because that field is declared unique in the schema - confirm.
        """
        self._get_writer().update_document(
            url=self._text(url),
            title=self._text(title),
            content=self._text(content))

    def delete_document(self, url):
        """Queue deletion of the document(s) whose ``url`` field is *url*.

        The deletion goes through the session writer, like every other
        change, and is saved by the next ``commit()``.
        """
        self._get_writer().delete_by_term('url', self._text(url))

    def commit(self):
        """Save all queued changes and optimize the index."""
        writer = self._get_writer()
        try:
            writer.commit(optimize=True)
        finally:
            # The writer is finished either way; never hand it out again.
            self._writer = None

    def cancel(self):
        """Discard all queued changes; does nothing when none are pending."""
        if self._writer is None:
            return
        try:
            self._writer.cancel()
        finally:
            self._writer = None

    def find(self, querystring):
        """Search the index for *querystring* and return at most 50 hits.

        The query is parsed by ``_queryparser``.
        """
        query = self._queryparser.parse(self._text(querystring))
        # NOTE(review): the searcher is intentionally not closed here; the
        # returned results are presumably still backed by it - confirm
        # before adding a close().
        searcher = self._index.searcher()
        return searcher.search(query, limit=50)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment