Skip to content

Instantly share code, notes, and snippets.

@barseghyanartur
Forked from frague59/es 2.0 integration
Created February 8, 2016 12:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barseghyanartur/9ba2d57b490888bf32d4 to your computer and use it in GitHub Desktop.
Save barseghyanartur/9ba2d57b490888bf32d4 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Search features for :
* :mod:`elasticsearch.elasticsearch`
* :mod:`haystack:haystack`
* :mod:`elasticstack:elasticstack`
:creationdate: 05/11/15 15:05
:moduleauthor: François GUÉRIN <fguerin@ville-tourcoing.fr>
:modulename: intrautils.search
"""
import base64
import json
import logging
from copy import copy, deepcopy
import haystack
from django import forms
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
from django.db import models as dj_models
from django.db.models.fields.files import FieldFile as dj_File
from django.utils.translation import ugettext_lazy as _
from elasticsearch import NotFoundError
from elasticstack.backends import ConfigurableElasticBackend, ConfigurableElasticSearchEngine
from elasticstack.fields import FacetField
from elasticstack.forms import SearchForm
from filer.models import File as fi_File
from form_utils.forms import BetterForm
from haystack import DEFAULT_ALIAS
from haystack.backends import SQ
from haystack.constants import DJANGO_CT, DJANGO_ID
from haystack.fields import SearchField
from haystack.forms import model_choices
from urllib3.fields import guess_content_type
from utils.forms import CollapsibleFieldsetFormMixin
__author__ = 'fguerin'

logger = logging.getLogger('intrautils.search')

#: Mapping applied when a field type has no entry in ``TYPE_MAPPINGS``
DEFAULT_TYPE_MAPPINGS = {
    'type': 'string',
    'analyzer': 'french',
}

#: Type mappings: haystack ``field_type`` -> elasticsearch field mapping
TYPE_MAPPINGS = {
    # Text-like fields
    'string': {
        'type': 'string',
        'analyzer': 'french',
    },
    'edge_ngram': {
        'type': 'string',
        'analyzer': 'edgengram_analyzer',
    },
    'ngram': {
        'type': 'string',
        'analyzer': 'ngram_analyzer',
    },
    # Dates and datetimes share the same elasticsearch type
    'date': {
        'type': 'date',
    },
    'datetime': {
        'type': 'date',
    },
    'location': {
        'type': 'geo_point',
    },
    'boolean': {
        'type': 'boolean',
    },
    # Numeric fields: integers are stored as ``long``
    'float': {
        'type': 'float',
    },
    'long': {
        'type': 'long',
    },
    'integer': {
        'type': 'long',
    },
    # Attachment fields, with a mapping for each sub-field
    'attachment': {
        'type': 'attachment',
        'fields': {
            'content': {
                'copy_to': 'copy',
                'type': 'string',
                'term_vector': 'with_positions_offsets',
                'store': 'yes',
                'analyzer': 'edgengram_analyzer',
            },
            'title': {
                'analyzer': 'french',
            },
            'author': {
                'analyzer': 'edgengram_analyzer',
            },
            'content_type': {
                'analyzer': 'edgengram_analyzer',
            },
            'content_length': {
                'store': 'yes',
                'type': 'integer',
            },
        },
    },
}
class ExtendedElasticsearchBackend(ConfigurableElasticBackend):
    """
    Adds ***attachment*** support for elasticsearch backend settings
    """

    def setup(self):
        """
        Defers loading until needed.

        .. note::

            This code is a copy of
            :meth:`haystack:haystack.backends.elasticsearch_backend.ElasticsearchSearchBackend.setup`,
            except that the _boost parameter has been removed.
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
        except NotFoundError:
            # No mapping fetched: keep whatever ``existing_mapping`` already
            # holds; the comparison below then decides whether to (re)create it.
            pass
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
        current_mapping = {
            'modelresult': {
                'properties': field_mapping,
            }
        }

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first.
                self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400)
                self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise

        self.setup_complete = True

    def extract_file_contents(self, file_obj):
        """
        Decode a base64 payload and report its decoded length.

        :param file_obj: base64 data, either as a string or as an object
                         exposing ``read()``
        :returns: dict with the decoded ``contents`` and a ``metadata`` dict
                  holding ``content_length``
        """
        # ``base64.decode`` needs two file arguments (input, output) and could
        # never work with a single one; ``b64decode`` returns the decoded data.
        raw = file_obj.read() if hasattr(file_obj, 'read') else file_obj
        contents = base64.b64decode(raw)
        metadata = {'content_length': len(contents)}
        return {'contents': contents, 'metadata': metadata}

    def build_schema(self, fields):
        """
        Merge from `haystack` and `elasticstack` `elasticsearch` backend `build_schema` methods.

        It provides an additional feature : custom field mappings, taken from the module level
        ``TYPE_MAPPINGS`` dict (``DEFAULT_TYPE_MAPPINGS`` is used for unknown field types).

        :param fields: fields to map to the backend, as a ``{field_name: field_instance}`` dict
        :returns: tuple content_field_name, mapping
        """
        content_field_name = ''
        final_mapping = {
            DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
            DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
        }

        if settings.DEBUG:
            logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                         u'default_type_mappings = \n%s'
                         u'\ntype_mappings = \n%s',
                         json.dumps(DEFAULT_TYPE_MAPPINGS, indent=2),
                         json.dumps(TYPE_MAPPINGS, indent=2))

        for field_name, field_class in fields.items():
            field_type = field_class.field_type
            # Each field gets its own deep copy of the template: the mapping is
            # mutated below (boost / analyzer), and sharing the template dict
            # would leak one field's settings into every field of that type and
            # into the module level constants.
            _mapping_for_field = deepcopy(TYPE_MAPPINGS.get(field_type, DEFAULT_TYPE_MAPPINGS))

            if field_class.boost != 1.0:
                _mapping_for_field['boost'] = field_class.boost

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # Do this last to override `text` fields.
            if _mapping_for_field['type'] == 'string' and field_class.indexed:
                if not hasattr(field_class, 'facet_for') and field_class.field_type not in ('ngram', 'edge_ngram'):
                    _mapping_for_field['analyzer'] = getattr(field_class, 'analyzer', self.DEFAULT_ANALYZER)

            final_mapping[field_class.index_fieldname] = _mapping_for_field

        if settings.DEBUG:
            logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                         u'mapping = \n%s',
                         json.dumps(final_mapping, indent=2))
        return content_field_name, final_mapping

    def more_like_this(self, model_instance, additional_query_string=None, result_class=None, **kwargs):
        """
        Gives "more like this" items

        :param model_instance: model instance
        :param additional_query_string: additional string
        :param result_class: result
        :param kwargs: additional kwargs
        :returns: result of the parent implementation
        """
        return super(ExtendedElasticsearchBackend, self).more_like_this(model_instance, additional_query_string,
                                                                        result_class, **kwargs)

    def update(self, index, iterable=None, commit=True):
        """
        Update the backend index, delegating to the parent implementation.

        :param index: search index to update
        :param iterable: objects to (re)index
        :param commit: forwarded to the parent (it was silently dropped before, so
                       ``commit=False`` had no effect)
        :returns: result of the parent implementation
        """
        return super(ExtendedElasticsearchBackend, self).update(index, iterable, commit=commit)

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='',
                            highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None,
                            spelling_query=None, within=None, dwithin=None, distance_point=None, models=None,
                            limit_to_registered_models=None, result_class=None):
        """
        Build the search kwargs; plain pass-through to the parent implementation.

        All the parameters are forwarded positionally, in declaration order.

        :returns: result of the parent implementation
        """
        return super(ExtendedElasticsearchBackend, self).build_search_kwargs(query_string, sort_by, start_offset,
                                                                             end_offset, fields,
                                                                             highlight, facets, date_facets,
                                                                             query_facets, narrow_queries,
                                                                             spelling_query, within, dwithin,
                                                                             distance_point, models,
                                                                             limit_to_registered_models, result_class)
class ExtendedElasticSearchEngine(ConfigurableElasticSearchEngine):
    """
    Search engine using :class:`ExtendedElasticsearchBackend` as its backend
    """
    backend = ExtendedElasticsearchBackend
class AttachmentField(SearchField):
    """
    Mapping for an `AttachmentField`
    """
    field_type = 'attachment'
    author_field_name = 'user_author'
    author = None

    def __init__(self, **kwargs):
        """
        Initialize the field, consuming the attachment specific kwargs.

        :param kwargs: parent kwargs, plus the optional ``content_type_field`` and ``author``
        """
        if 'content_type_field' in kwargs:
            self.content_type_field = kwargs.pop('content_type_field')
        if 'author' in kwargs:
            # Pop the key that was actually tested: the code used to pop
            # ``author_field_name`` ('user_author') instead, which raised a
            # ``KeyError`` or left 'author' in the kwargs given to the parent.
            self.author = kwargs.pop('author')
        super(AttachmentField, self).__init__(**kwargs)

    def convert(self, value):
        """
        Convert an attachment file to serializable data

        :param value: value to convert
        :returns: the value, unchanged
        """
        return value

    @staticmethod
    def _get_file_data(field):
        """
        Build the attachment payload for a file.

        :param field: filer ``File`` instance, or any other file-like object
                      exposing a ``name`` (treated as a django file)
        :returns: dict with the base64 encoded content and the file metadata
        """
        if isinstance(field, fi_File):
            # filer objects wrap the actual file and expose a label
            field_file = field.file
            name = field.label
        else:  # isinstance(field, dj_File):
            field_file = field
            name = field_file.name
        title = name
        content_type = guess_content_type(name)

        try:
            content_length = len(field_file)
        except TypeError:
            # The wrapper has no length: fall back to the wrapped file
            content_length = len(field_file.file)

        try:
            content = base64.b64encode(field_file.read())
        except AttributeError:
            # No ``read()``: the object is encoded as raw data
            content = base64.b64encode(field_file)

        return {'_language': 'fr',
                '_content': content,
                '_content_type': content_type,
                '_name': name,
                '_title': title,
                '_content_length': content_length}

    def prepare(self, obj):
        """
        Prepare the attachment data for indexing.

        :param obj: object being indexed; the file is read from ``self.model_attr`` when set,
                    else the object itself is used as the file
        :returns: attachment payload, see :meth:`_get_file_data`
        :raises NotImplementedError: if the value is neither a django nor a filer file
        """
        if self.model_attr:
            field = getattr(obj, self.model_attr)
        else:
            field = obj

        if not isinstance(field, (dj_File, fi_File)):
            raise NotImplementedError('AttachmentField does not implement file reading for %s file'
                                      % field.__class__.__name__)

        output = self._get_file_data(field)

        if settings.DEBUG:
            # Only a truncated preview is logged: no need to duplicate the
            # whole encoded content for that.
            preview = dict(output, _content=output['_content'][:50] + '...')
            logger.debug(u'AttachmentField::prepare() output = %s', json.dumps(preview, indent=2))
        return output
class FacetedAttachmentField(FacetField, AttachmentField):
    """
    Glue class: combines `FacetField` with `AttachmentField`, adding nothing of its own
    """
def application_model_choices(app_name, using=DEFAULT_ALIAS):
    """
    Gets the model choices restricted to one or many applications

    :param app_name: application name, or a tuple / list of application names. A choice is
                     kept when the name is contained in its first item.
    :param using: haystack connection alias
    :returns: matching choices, sorted on their second item
    """
    if isinstance(app_name, (tuple, list)):
        # Each application is resolved through a recursive call, so the
        # choices are not fetched here: that would be one unused lookup.
        output = []
        for app in app_name:
            output.extend(application_model_choices(app, using))
    else:
        output = [choice for choice in model_choices(using) if app_name in choice[0]]
    return sorted(output, key=lambda x: x[1])
class HaystackSearchForm(CollapsibleFieldsetFormMixin, SearchForm, BetterForm):
    """
    :mod:`haystack:haystack` search form for main `searching` feature
    """

    class Meta:
        fieldsets = (('main', {'legend': _('search'), 'fields': ('search_query', 'models', 'more_like_this')}),)

    #: name of the form field holding the query
    search_field_name = 'search_query'

    #: if True, ``load_all()`` is applied on the results
    load_all = True

    #: can be a single application or a list of applications
    search_app = None

    #: global search field
    search_query = forms.CharField(label=_('Search'), required=False, max_length=255,
                                   help_text=_('You can use the wildcard * to search for words fragments, '
                                               'by example "comm*" will search for words starting by "comm". '
                                               'You can also write more than a word, each word will be searched.'))

    def get_search_apps(self):
        """
        Gets the application(s) the search is restricted to

        :returns: ``search_app`` if set, else None
        """
        return self.search_app or None

    def get_models(self):
        """
        Gets the model classes selected through the `models` field

        :returns: list of model classes, empty if the form is not valid
        """
        if not self.is_valid():
            return []
        # noinspection PyUnresolvedReferences
        return [dj_models.get_model(*model.split('.')) for model in self.cleaned_data['models']]

    def get_filters(self, search_query):
        """
        Build filter from a search_query

        A leading ``*`` on a word adds an "ends with" alternative, a trailing ``*`` adds a
        "starts with" one. When many words are given, all of them must match.

        :param search_query: search query
        :returns: built filters
        """
        searched = search_query.strip('*')
        if ' ' in searched:
            filters = SQ()
            # ``split()`` collapses runs of whitespace, so repeated, leading or
            # trailing spaces do not produce empty words anymore.
            for item in search_query.split():
                term = item.strip('*')
                if not term:
                    # Word made of wildcards only: nothing to search for
                    continue
                sub_filters = SQ(text__contains=term)
                if item.startswith('*'):
                    sub_filters |= SQ(text__endswith=term)
                if item.endswith('*'):
                    sub_filters |= SQ(text__startswith=term)
                filters &= sub_filters
        else:
            filters = SQ(text__contains=searched)
            if search_query.startswith('*'):
                filters |= SQ(text__endswith=searched)
            if search_query.endswith('*'):
                filters |= SQ(text__startswith=searched)

        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::get_filters(%s) filters = %s', search_query, filters)
        return filters

    @staticmethod
    def get_fields():
        """
        Gets the fields for the search

        :returns: list of fields
        """
        fields = ['document_file.content', 'text', 'content', 'title', ]
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::get_fields() fields = %s', fields)
        return fields

    def search(self):
        """
        Performs the search

        The query filters are restricted to the selected models if any, else to the
        application(s) given by :meth:`get_search_apps`.

        :returns: search query set, or the result of :meth:`no_query_found` if the form is
                  not valid or the query is empty
        """
        if not self.is_valid():
            return self.no_query_found()

        search_query = self.cleaned_data.get(self.search_field_name)
        if not search_query:
            return self.no_query_found()

        search_apps = self.get_search_apps()
        search_models = self.get_models()
        more_like_this = self.cleaned_data.get('more_like_this', False)
        filters = self.get_filters(search_query)

        if search_models:
            # Restrict on the selected models
            sub_filters = None
            for model in search_models:
                model_ct = ContentType.objects.get_for_model(model)
                _filter = SQ(django_ct__iexact='%s.%s' % (model_ct.app_label, model_ct.model))
                sub_filters = (sub_filters | _filter) if sub_filters else _filter
            filters = filters & sub_filters if filters else sub_filters
        elif isinstance(search_apps, basestring):
            # Restrict on a single application
            filters &= SQ(django_ct__startswith=search_apps)
        elif isinstance(search_apps, (tuple, list)):
            # Restrict on any of the applications
            sub_filters = None
            for search_app in search_apps:
                _filter = SQ(django_ct__startswith=search_app)
                if sub_filters:
                    sub_filters |= _filter
                else:
                    sub_filters = _filter
            if sub_filters:
                filters &= sub_filters

        search_query_set = self.searchqueryset.filter(filters)
        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::search() '
                         u'search_query_set.query = %s (%d)', search_query_set.query,
                         len(search_query_set))

        # The query set is not evaluated here anymore: testing its truth value
        # ran the search, only to call ``load_all()`` on an empty result.

        # Search for `more_like_this` items
        # NOTE(review): haystack's ``more_like_this`` presumably expects a model
        # instance, not the query string - confirm before enabling the field.
        if more_like_this:
            search_query_set = search_query_set.more_like_this(search_query).load_all()

        if self.load_all:
            search_query_set = search_query_set.load_all()

        if settings.DEBUG:
            logger.debug(u'HaystackSearchForm::search() search_query (1) = %s ', search_query_set.query)
            logger.debug(u'HaystackSearchForm::search() len(search_query_set) = %d '
                         u'(after models filtering)', len(search_query_set))
        return search_query_set

    @staticmethod
    def no_query_found():
        """
        Gets the results used when there is nothing to search for

        :returns: empty list
        """
        return []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment