Create a gist now

Instantly share code, notes, and snippets.

Experimental django-haystack 2 backend which adds support for Solr's grouping / field collapsing
# encoding: utf-8
"""Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
# NOTE: You must be running the latest Pysolr master - no PyPI release yet!
# See https://gist.github.com/3750774 for the current version of this code
# See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
from __future__ import absolute_import
import logging
from django.db.models.loading import get_model
from haystack.backends import EmptyResults
from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
from haystack.models import SearchResult
from haystack.query import SearchQuerySet
# Since there's no chance of this being portable (yet!) we'll import explicitly
# rather than using the generic imports:
class GroupedSearchQuery(SolrSearchQuery):
    """Solr search query which can ask Solr to group (field-collapse) results.

    Grouping stays off until ``add_group_by`` has been called; until then
    ``build_params`` returns exactly what the parent class builds.
    """

    # Documents returned per group unless ``add_group_by`` is given a limit.
    DEFAULT_GROUP_LIMIT = 2

    def __init__(self, *args, **kwargs):
        super(GroupedSearchQuery, self).__init__(*args, **kwargs)
        self.grouping_field = None
        self.grouping_limit = self.DEFAULT_GROUP_LIMIT
        self._total_document_count = None

    def _clone(self, **kwargs):
        """Return a copy of this query which keeps the grouping settings.

        The cached document count is deliberately not copied: it belongs to
        the query that was actually run.
        """
        clone = super(GroupedSearchQuery, self)._clone(**kwargs)
        clone.grouping_field = self.grouping_field
        clone.grouping_limit = self.grouping_limit
        return clone

    def add_group_by(self, field_name, limit=None):
        """Group results on ``field_name``.

        ``limit`` optionally overrides how many documents are returned per
        group; when omitted the current setting (2 by default) is kept.
        """
        self.grouping_field = field_name
        if limit is not None:
            self.grouping_limit = limit

    def post_process_facets(self, results):
        """Record the total count from ``results``, then defer to the parent."""
        # FIXME: remove this hack once
        # https://github.com/toastdriven/django-haystack/issues/750 lands
        # See the matches dance in GroupedSolrSearchBackend._process_results.
        # NOTE(review): 'hits' wins when both keys are present, and the
        # grouped backend in this file stores the group count under 'hits'
        # and the document count under 'matches' -- confirm this precedence
        # is the intended one.
        total = 0
        for count_key in ('hits', 'matches'):
            if count_key in results:
                total = int(results[count_key])
                break
        self._total_document_count = total

        return super(GroupedSearchQuery, self).post_process_facets(results)

    def get_total_document_count(self):
        """Return the total number of matching documents rather than document groups

        If the query has not been run, this will execute the query and store the results.
        """
        if self._total_document_count is None:
            self.run()

        return self._total_document_count

    def build_params(self, *args, **kwargs):
        """Return the parent's search params plus Solr grouping options.

        The grouping options (and the ``GroupedSearchResult`` result class)
        are only added once a grouping field has been set.
        """
        params = super(GroupedSearchQuery, self).build_params(*args, **kwargs)

        if self.grouping_field is None:
            return params

        params.update({
            'group': 'true',
            'group.field': self.grouping_field,
            'group.ngroups': 'true',
            'group.limit': self.grouping_limit,
            'group.sort': 'django_ct desc, score desc',
            'group.facet': 'true',
            'result_class': GroupedSearchResult,
        })
        return params
class GroupedSearchResult(object):
    """One group from a grouped Solr response.

    Attributes set by ``__init__``:

    * ``field_name`` -- the field the results were grouped on
    * ``key`` -- the group's ``groupValue``
    * ``hits`` -- the group's ``numFound``
    * ``documents`` -- list of ``SearchResult`` objects built from the
      group's ``docs``
    """

    def __init__(self, field_name, group_data, raw_results=None):
        """Build the group from one entry of Solr's ``groups`` list.

        ``raw_results`` is the full backend response; it is only consulted
        for an optional ``highlighting`` attribute.
        """
        self.field_name = field_name
        self.key = group_data['groupValue']  # TODO: convert _to_python
        doclist = group_data['doclist']
        self.hits = doclist['numFound']
        self.documents = list(self.process_documents(doclist['docs'],
                                                     raw_results=raw_results))

    def __unicode__(self):
        # The group value is stored as ``key``; formatting a non-existent
        # ``group_key`` attribute raised AttributeError on every call.
        return 'GroupedSearchResult({0.field_name}={0.key}, hits={0.hits})'.format(self)

    def process_documents(self, doclist, raw_results):
        """Yield a ``SearchResult`` for each raw Solr document in ``doclist``.

        Documents whose model cannot be loaded, or is not registered with
        the unified index, are skipped.
        """
        # TODO: tame import spaghetti
        from haystack import connections
        # NOTE(review): the connection alias is hard-coded to "en" --
        # confirm it matches the deployment's HAYSTACK_CONNECTIONS.
        engine = connections["en"]
        conn = engine.get_backend().conn

        unified_index = engine.get_unified_index()
        indexed_models = unified_index.get_indexed_models()
        # Works for any raw_results object (including None): a missing
        # attribute simply means there is nothing to highlight.
        highlighting = getattr(raw_results, 'highlighting', {})

        for raw_result in doclist:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            model = get_model(app_label, model_name)

            if not (model and model in indexed_models):
                continue

            # The index depends only on the model, so look it up once per
            # document instead of once per field.
            index = unified_index.get_index(model)
            additional_fields = {}

            for key, value in raw_result.items():
                string_key = str(key)
                field = index.fields.get(string_key) if string_key in index.fields else None

                if field is not None and hasattr(field, 'convert'):
                    additional_fields[string_key] = field.convert(value)
                else:
                    additional_fields[string_key] = conn._to_python(value)

            # These are passed to SearchResult positionally below.
            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]
            del additional_fields['score']

            if raw_result[ID] in highlighting:
                additional_fields['highlighted'] = highlighting[raw_result[ID]]

            yield SearchResult(app_label, model_name, raw_result[DJANGO_ID],
                               raw_result['score'], **additional_fields)
class GroupedSearchQuerySet(SearchQuerySet):
    """SearchQuerySet whose results are groups rather than single documents.

    It only works on top of a ``GroupedSearchQuery``.
    """

    def __init__(self, *args, **kwargs):
        super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)

        query_is_grouped = isinstance(self.query, GroupedSearchQuery)
        if not query_is_grouped:
            raise TypeError("GroupedSearchQuerySet must be used with a GroupedSearchQuery query")

    def group_by(self, field_name):
        """Have Solr group results based on the provided field name"""
        grouped = self._clone()
        grouped.query.add_group_by(field_name)
        return grouped

    def post_process_results(self, results):
        # Grouped results are handed back untouched: the default
        # model-specific processing is intentionally bypassed.
        return results

    def total_document_count(self):
        """Returns the count for the total number of matching documents rather than groups

        A GroupedSearchQuerySet normally returns the number of document groups; this allows
        you to indicate the total number of matching documents - quite handy for making facet counts match the
        displayed numbers
        """
        # Reuse this queryset's query when it has already run; otherwise
        # run the query of a clone so this queryset's own query is not
        # executed.
        query = self.query if self.query.has_run() else self._clone().query
        return query.get_total_document_count()
class GroupedSolrSearchBackend(SolrSearchBackend):
    """Solr backend which passes grouping options through and parses grouped responses."""

    def build_search_kwargs(self, *args, **kwargs):
        """Build the Solr kwargs, forwarding every ``group*`` option untouched.

        Options whose name starts with ``group`` are removed before the
        parent builds its kwargs and merged back into the result afterwards.
        """
        # Snapshot the matching keys first: popping while iterating over a
        # live ``keys()`` view raises RuntimeError.
        group_keys = [key for key in kwargs if key.startswith("group")]
        group_kwargs = [(key, kwargs.pop(key)) for key in group_keys]

        res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)

        res.update(group_kwargs)

        # NOTE(review): this inspects the caller's kwargs rather than ``res``;
        # if the parent receives the ordering under another keyword (e.g.
        # ``sort_by``) this default is always applied when grouping --
        # confirm that is intended.
        if group_kwargs and 'sort' not in kwargs:
            res['sort'] = 'score desc, item_id asc'

        return res

    def _process_results(self, raw_results, result_class=None, **kwargs):
        """Convert a grouped Solr response into Haystack's results dict.

        For grouped responses ``res['results']`` becomes a list of
        ``result_class`` instances (one per group), ``res['hits']`` the
        number of groups and ``res['matches']`` the field group's
        ``matches`` value. Responses for any other result class are
        returned exactly as the parent processed them.

        Raises RuntimeError when a grouped search comes back with plain docs.
        """
        res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
                                                                     result_class=result_class,
                                                                     **kwargs)

        if result_class and not issubclass(result_class, GroupedSearchResult):
            return res

        if len(raw_results.docs):
            raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")

        assert not res['results']
        assert not res['hits']

        if isinstance(raw_results, EmptyResults):
            return res

        assert len(raw_results.grouped) == 1, "Grouping on more than one field is not supported"

        res['results'] = results = []
        for field_name, field_group in raw_results.grouped.items():
            res['hits'] = field_group['ngroups']
            res['matches'] = field_group['matches']

            for group in field_group['groups']:
                if group['groupValue'] is None:
                    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
                    res['hits'] -= 1  # Avoid confusing Haystack with excluded bogon results
                    continue

                results.append(result_class(field_name, group, raw_results=raw_results))

        return res
class GroupedSolrEngine(SolrEngine):
    """Haystack engine which pairs the grouped query class with the grouped backend."""

    query = GroupedSearchQuery
    backend = GroupedSolrSearchBackend
@mynameistechno

Hey @acdha, this looks great, have you been using it? I.e. how stable is this for production use? Thanks

@acdha
Owner
acdha commented Jul 29, 2014

@mynameistechno I've been using it in production since last November: http://chris.improbable.org/2014/3/17/content-search-on-a-budget/

It's not feature complete with all of the other backends but it handles the subset of functionality which I need.

@TippyTipster

Do you have any tutorials or documentation on how to implement this? I'm fairly new to haystack/solr and can't seem to figure out where/how I should be using these.

@Pablo1990

Hey!

I also need a tutorial on how to implement this. Where should I add this code?

Thanks in advance!

@acdha
Owner
acdha commented Feb 29, 2016

@tippyTipster, @Pablo1990: belated reply since I never saw a notification for your comments, but for the record: I have that in a separate module (e.g. my_project/search/grouped.py), and my Django settings module has 'ENGINE': 'my_project.search.grouped.GroupedSolrEngine' in the Haystack connection configuration, following the examples at http://django-haystack.readthedocs.org/en/v2.4.1/tutorial.html#modify-your-settings-py.

@acdha
Owner
acdha commented Sep 12, 2016

Anyone who finds this of interest but wants to use the newer / maybe-faster Collapsing filter and Expand component: https://gist.github.com/acdha/0a66ca23984bc8d607936fecd9c29941

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment