Skip to content

Instantly share code, notes, and snippets.

@paulorsbrito
Forked from gregplaysguitar/folding_whoosh_backend.py
Last active December 25, 2015 07:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paulorsbrito/6939000 to your computer and use it in GitHub Desktop.
Save paulorsbrito/6939000 to your computer and use it in GitHub Desktop.
This is a fork of Greg's gist, which allows the use of character folding with haystack and whoosh. What I've added is the ability to fold EdgeNGram fields as well. This is useful if you're using autocomplete or just indexing text as an EdgeNGram field to allow partial keyword search. Thanks, Greg. Have fun.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This is a fork of Greg's gist, which allows the use of character folding with haystack and whoosh.
What I've added is the ability to fold EdgeNGram fields as well. This is useful if you're using
autocomplete or just indexing text as an EdgeNGram field to allow partial keyword search. Thanks, Greg. Have fun.
-+- original comments -+-
https://gist.github.com/gregplaysguitar/1727204
Whoosh backend for haystack that implements character folding, as per http://packages.python.org/Whoosh/stemming.html#character-folding .
To use, put this file on your path and add it to your haystack settings, eg.
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': 'search_backends.FoldingWhooshEngine',
'PATH': 'path-to-whoosh-index',
},
}
"""
from haystack.backends.whoosh_backend import WhooshEngine, WhooshSearchBackend, WHOOSH_ID, ID, DJANGO_CT, DJANGO_ID, Schema, IDLIST, TEXT, KEYWORD, NUMERIC, BOOLEAN, DATETIME, NGRAM, NGRAMWORDS, WhooshSearchQuery
from haystack.exceptions import SearchBackendError
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh.support.charset import accent_map
class FoldingWhooshSearchBackend(WhooshSearchBackend):
    """Whoosh backend for haystack that applies character folding.

    Text, ngram and edge-ngram fields have a ``CharsetFilter(accent_map)``
    appended to their analyzer; all other field types are declared without
    a custom analyzer.
    """

    def build_schema(self, fields):
        """Build the Whoosh schema for the given haystack fields.

        Args:
            fields: Mapping of field name to haystack field object. Each
                field object is read for ``is_multivalued``, ``indexed``,
                ``field_type``, ``index_fieldname``, ``stored``, ``boost``
                and ``document``.

        Returns:
            A ``(content_field_name, schema)`` tuple.  ``content_field_name``
            is the ``index_fieldname`` of the last field whose ``document``
            attribute is ``True`` (or ``''`` if there is none).

        Raises:
            SearchBackendError: If ``fields`` adds no entries beyond the
                hard-coded haystack keys.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        # Named ``folding_filter`` so the ``filter`` builtin is not shadowed.
        folding_filter = CharsetFilter(accent_map)
        analyzer = StemmingAnalyzer() | folding_filter

        for field_class in fields.values():
            index_fieldname = field_class.index_fieldname

            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[index_fieldname] = IDLIST(
                        stored=True,
                        field_boost=field_class.boost,
                    )
                else:
                    schema_fields[index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost,
                    )
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=int,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == 'float':
                schema_fields[index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=float,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                # NGRAM does not take an ``analyzer`` keyword (it builds its
                # own n-gram analyzer), so the folding filter is composed
                # onto that analyzer afterwards, same as for edge n-grams.
                ngram_field = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
                ngram_field.analyzer |= folding_filter
                schema_fields[index_fieldname] = ngram_field
            elif field_class.field_type == 'edge_ngram':
                edge_ngram_field = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
                edge_ngram_field.analyzer |= folding_filter
                schema_fields[index_fieldname] = edge_ngram_field
            else:
                schema_fields[index_fieldname] = TEXT(
                    stored=True,
                    analyzer=analyzer,
                    field_boost=field_class.boost,
                )

            if field_class.document is True:
                content_field_name = index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))
class FoldingWhooshEngine(WhooshEngine):
    """Haystack engine that uses ``FoldingWhooshSearchBackend`` as its backend.

    Reference this class in the ``ENGINE`` entry of ``HAYSTACK_CONNECTIONS``.
    """

    backend = FoldingWhooshSearchBackend
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment