Skip to content

Instantly share code, notes, and snippets.

@habedi
Forked from turicas/test_whoosh.py
Created December 6, 2019 13:27
Show Gist options
  • Save habedi/1a2b8ba3751f9d671433ca829da7f7a7 to your computer and use it in GitHub Desktop.
Save habedi/1a2b8ba3751f9d671433ca829da7f7a7 to your computer and use it in GitHub Desktop.
Some tests with Whoosh (a full-text search library written entirely in Python)
#!/usr/bin/env python
# coding: utf-8
# To bootstrap the environment:
# mkvirtualenv whoosh
# pip install whoosh
import os
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
class Index(object):
    '''Integrate whoosh's indexer and searcher behind a single object.

    An instance wraps one on-disk whoosh index: documents are written with
    `add_document` / `add_documents` and queried with `search`.
    '''

    def __init__(self, index_path, schema=None):
        '''Open the index stored at `index_path`, creating it if missing.

        :param index_path: directory that holds (or will hold) the index.
        :param schema: schema used when a new index has to be created. It is
            only consulted when `index_path` does not exist yet; when the
            path exists the schema stored in the opened index is used.
        :raises ValueError: if `index_path` does not exist and no `schema`
            was given.
        '''
        if os.path.exists(index_path):
            self._index = open_dir(index_path)
            self._schema = self._index.schema
            return

        if schema is None:
            raise ValueError('You need to specify a `schema` when creating'
                             ' an index')
        # `makedirs` instead of `mkdir` so nested index paths whose parent
        # directories are missing can be created as well.
        os.makedirs(index_path)
        self._index = create_in(index_path, schema)
        self._schema = schema

    def add_document(self, **document):
        '''Add a document to the index.

        Document properties should be passed as parameters, like in:
        >>> my_index.add_document(title=u'My Title', content=u'The content')
        '''
        # Single-document case of `add_documents`: one writer, one commit.
        self.add_documents([document])

    def add_documents(self, documents):
        '''Add a list of documents (`list` of `dict`s) to the index.

        It's an optimized version of `add_document` since it calls `commit`
        only in the end.

        If adding any document fails, the writer is cancelled (nothing from
        this call is committed) and the original exception is re-raised.
        '''
        writer = self._index.writer()
        try:
            for document in documents:
                writer.add_document(**document)
        except BaseException:
            # Do not leave a half-filled writer behind: cancel it so it can
            # release whatever it holds, then let the caller see the error.
            writer.cancel()
            raise
        writer.commit()

    def search(self, query, field):
        '''Search the index and return the results object.

        :param query: query string, parsed with `QueryParser`.
        :param field: name of the field the parser searches by default.
        :returns: whatever `searcher.search` returns for the parsed query.
        '''
        query_object = QueryParser(field, self._schema).parse(query)
        # NOTE(review): the searcher is intentionally not closed here; the
        # returned results presumably still read from it -- confirm before
        # adding a `close()` or a `with` block.
        searcher = self._index.searcher()
        return searcher.search(query_object)
if __name__ == '__main__':
    # Self-test: build a throw-away index, add documents in bulk and one at
    # a time, and check that searching the `content` field finds them.
    import shutil

    index_path = 'index-test'
    # Start from a clean slate; a missing directory is not an error here.
    try:
        shutil.rmtree(index_path)
    except OSError:
        pass

    my_index = Index(
        index_path,
        Schema(filename=TEXT, id=ID(stored=True), content=TEXT),
    )

    # Bulk insert: only the first document contains the word "first".
    my_index.add_documents([
        {u'filename': u'a.txt', u'id': u'1', u'content': u'first document'},
        {u'filename': u'b.txt', u'id': u'2', u'content': u'2nd document'},
        {u'filename': u'c.txt', u'id': u'3', u'content': u'3rd document'},
    ])
    hits = my_index.search(u'first', u'content')
    assert len(hits) == 1
    assert hits[0][u'id'] == u'1'

    # Single insert: a second document mentioning "first" must now match too.
    my_index.add_document(
        **{u'filename': u'a.txt', u'id': u'4', u'content': u'not first'}
    )
    hits = my_index.search(u'first', u'content')
    assert len(hits) == 2
    found_ids = {hit[u'id'] for hit in hits}
    assert found_ids == {u'1', u'4'}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment